Diffstat (limited to 'fs')
452 files changed, 18492 insertions, 10310 deletions
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index fc7efd0a7525..c9798750202d 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -16,7 +16,7 @@ config BCACHEFS_FS
 	select ZSTD_COMPRESS
 	select ZSTD_DECOMPRESS
 	select CRYPTO
-	select CRYPTO_SHA256
+	select CRYPTO_LIB_SHA256
 	select CRYPTO_CHACHA20
 	select CRYPTO_POLY1305
 	select KEYS
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index d2689388d5e8..9af65079374f 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -41,7 +41,6 @@ bcachefs-y :=		\
 	extent_update.o		\
 	eytzinger.o		\
 	fs.o			\
-	fs-common.o		\
 	fs-ioctl.o		\
 	fs-io.o			\
 	fs-io-buffered.o	\
@@ -64,9 +63,11 @@ bcachefs-y :=		\
 	migrate.o		\
 	move.o			\
 	movinggc.o		\
+	namei.o			\
 	nocow_locking.o		\
 	opts.o			\
 	printbuf.o		\
+	progress.o		\
 	quota.o			\
 	rebalance.o		\
 	rcu_pending.o		\
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 3ea809990ef1..c12ca7538e4f 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -232,7 +232,7 @@ int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k,
 	int ret = 0;
 
 	bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k),
-			 c, alloc_v2_unpack_error,
+			 c, alloc_v3_unpack_error,
 			 "unpack error");
 fsck_err:
 	return ret;
@@ -589,6 +589,8 @@ iter_err:
 
 int bch2_alloc_read(struct bch_fs *c)
 {
+	down_read(&c->state_lock);
+
 	struct btree_trans *trans = bch2_trans_get(c);
 	struct bch_dev *ca = NULL;
 	int ret;
@@ -652,6 +654,7 @@ int bch2_alloc_read(struct bch_fs *c)
 	bch2_dev_put(ca);
 	bch2_trans_put(trans);
 
+	up_read(&c->state_lock);
 	bch_err_fn(c, ret);
 	return ret;
 }
@@ -673,8 +676,7 @@ static int __need_discard_or_freespace_err(struct btree_trans *trans,
 	bch2_bkey_val_to_text(&buf, c, alloc_k);
 
 	int ret = __bch2_fsck_err(NULL, trans, flags, err_id,
-				  "bucket incorrectly %sset in %s btree\n"
-				  "  %s",
+				  "bucket incorrectly %sset in %s btree\n%s",
 				  set ? "" : "un",
 				  bch2_btree_id_str(btree),
 				  buf.buf);
@@ -777,14 +779,12 @@ static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, s
 					s64 delta_sectors, s64 delta_fragmented, unsigned flags)
 {
-	struct disk_accounting_pos acc = {
-		.type = BCH_DISK_ACCOUNTING_dev_data_type,
-		.dev_data_type.dev = ca->dev_idx,
-		.dev_data_type.data_type = data_type,
-	};
 	s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented };
 
-	return bch2_disk_accounting_mod(trans, &acc, d, 3, flags & BTREE_TRIGGER_gc);
+	return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
+					 d, dev_data_type,
+					 .dev = ca->dev_idx,
+					 .data_type = data_type);
 }
 
 int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca,
@@ -837,7 +837,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
 	struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
 	if (!ca)
-		return -EIO;
+		return -BCH_ERR_trigger_alloc;
 
 	struct bch_alloc_v4 old_a_convert;
 	const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
@@ -871,6 +871,9 @@ int bch2_trigger_alloc(struct btree_trans *trans,
 		if (data_type_is_empty(new_a->data_type) &&
 		    BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
 		    !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
+			if (new_a->oldest_gen == new_a->gen &&
+			    !bch2_bucket_sectors_total(*new_a))
+				new_a->oldest_gen++;
 			new_a->gen++;
 			SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
 			alloc_data_type_set(new_a, new_a->data_type);
@@ -889,26 +892,20 @@ int bch2_trigger_alloc(struct btree_trans *trans,
 		    !new_a->io_time[READ])
 			new_a->io_time[READ] = bch2_current_io_time(c, READ);
 
-		u64 old_lru = alloc_lru_idx_read(*old_a);
-		u64 new_lru = alloc_lru_idx_read(*new_a);
-		if (old_lru != new_lru) {
-			ret = bch2_lru_change(trans, new.k->p.inode,
-					      bucket_to_u64(new.k->p),
-					      old_lru, new_lru);
-			if (ret)
-				goto err;
-		}
+		ret = bch2_lru_change(trans, new.k->p.inode,
+				      bucket_to_u64(new.k->p),
+				      alloc_lru_idx_read(*old_a),
+				      alloc_lru_idx_read(*new_a));
+		if (ret)
+			goto err;
 
-		old_lru = alloc_lru_idx_fragmentation(*old_a, ca);
-		new_lru = alloc_lru_idx_fragmentation(*new_a, ca);
-		if (old_lru != new_lru) {
-			ret = bch2_lru_change(trans,
-					      BCH_LRU_FRAGMENTATION_START,
-					      bucket_to_u64(new.k->p),
-					      old_lru, new_lru);
-			if (ret)
-				goto err;
-		}
+		ret = bch2_lru_change(trans,
+				      BCH_LRU_BUCKET_FRAGMENTATION,
+				      bucket_to_u64(new.k->p),
+				      alloc_lru_idx_fragmentation(*old_a, ca),
+				      alloc_lru_idx_fragmentation(*new_a, ca));
+		if (ret)
+			goto err;
 
 		if (old_a->gen != new_a->gen) {
 			ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen);
@@ -1032,9 +1029,9 @@ fsck_err:
 	bch2_dev_put(ca);
 	return ret;
invalid_bucket:
-	bch2_fs_inconsistent(c, "reference to invalid bucket\n  %s",
+	bch2_fs_inconsistent(c, "reference to invalid bucket\n%s",
			     (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf));
-	ret = -EIO;
+	ret = -BCH_ERR_trigger_alloc;
	goto err;
 }
@@ -1206,8 +1203,7 @@ int bch2_check_alloc_key(struct btree_trans *trans,
 
 	if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
 			trans, bucket_gens_key_wrong,
-			"incorrect gen in bucket_gens btree (got %u should be %u)\n"
-			"  %s",
+			"incorrect gen in bucket_gens btree (got %u should be %u)\n%s",
 			alloc_gen(k, gens_offset), a->gen,
 			(printbuf_reset(&buf),
 			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
@@ -1265,7 +1261,7 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
 	if (fsck_err_on(k.k->type != KEY_TYPE_set,
 			trans, freespace_hole_missing,
 			"hole in alloc btree missing in freespace btree\n"
-			"  device %llu buckets %llu-%llu",
+			"device %llu buckets %llu-%llu",
 			freespace_iter->pos.inode,
 			freespace_iter->pos.offset,
 			end->offset)) {
@@ -1424,7 +1420,7 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite
 	    (state == BCH_DATA_free &&
 	     genbits != alloc_freespace_genbits(*a))) {
 		if (fsck_err(trans, need_discard_freespace_key_bad,
-			     "%s\n  incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
+			     "%s\nincorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
			     (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
			     bch2_btree_id_str(iter->btree_id),
			     iter->pos.inode,
@@ -1505,7 +1501,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
 	struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode);
 	if (!ca) {
 		if (fsck_err(trans, bucket_gens_to_invalid_dev,
-			     "bucket_gens key for invalid device:\n  %s",
+			     "bucket_gens key for invalid device:\n%s",
 			     (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
 			ret = bch2_btree_delete_at(trans, iter, 0);
 		goto out;
@@ -1514,7 +1510,7 @@
 	if (fsck_err_on(end <= ca->mi.first_bucket ||
 			start >= ca->mi.nbuckets,
 			trans, bucket_gens_to_invalid_buckets,
-			"bucket_gens key for invalid buckets:\n  %s",
+			"bucket_gens key for invalid buckets:\n%s",
 			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
 		ret = bch2_btree_delete_at(trans, iter, 0);
 		goto out;
@@ -1705,7 +1701,8 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
 
 	u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
 	if (lru_idx) {
-		ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START,
+		ret = bch2_lru_check_set(trans, BCH_LRU_BUCKET_FRAGMENTATION,
+					 bucket_to_u64(alloc_k.k->p),
 					 lru_idx, alloc_k, last_flushed);
 		if (ret)
 			goto err;
@@ -1716,8 +1713,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
 
 	if (fsck_err_on(!a->io_time[READ],
 			trans, alloc_key_cached_but_read_time_zero,
-			"cached bucket with read_time 0\n"
-			"  %s",
+			"cached bucket with read_time 0\n%s",
 			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
 		struct bkey_i_alloc_v4 *a_mut =
@@ -1735,7 +1731,9 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
 		a = &a_mut->v;
 	}
 
-	ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, a->io_time[READ],
+	ret = bch2_lru_check_set(trans, alloc_k.k->p.inode,
+				 bucket_to_u64(alloc_k.k->p),
+				 a->io_time[READ],
 				 alloc_k, last_flushed);
 	if (ret)
 		goto err;
@@ -1757,7 +1755,8 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
 		for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
 				POS_MIN, BTREE_ITER_prefetch, k,
 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-			bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed)));
+			bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))) ?:
+		bch2_check_stripe_to_lru_refs(c);
 
 	bch2_bkey_buf_exit(&last_flushed, c);
 	bch_err_fn(c, ret);
@@ -1805,6 +1804,19 @@ struct discard_buckets_state {
 	u64		discarded;
 };
 
+/*
+ * This is needed because discard is both a filesystem option and a device
+ * option, and mount options are supposed to apply to that mount and not be
+ * persisted, i.e. if it's set as a mount option we can't propagate it to the
+ * device.
+ */
+static inline bool discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca)
+{
+	return test_bit(BCH_FS_discard_mount_opt_set, &c->flags)
+		? c->opts.discard
+		: ca->mi.discard;
+}
+
 static int bch2_discard_one_bucket(struct btree_trans *trans,
 				   struct bch_dev *ca,
 				   struct btree_iter *need_discard_iter,
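The precedence encoded in discard_opt_enabled() above deserves a second look: a discard setting given at mount time wins over the persisted per-device member flag, but is deliberately never written back to it. A minimal standalone model of that rule (the harness is hypothetical, not part of the patch):

    #include <stdbool.h>
    #include <stdio.h>

    /* mount_opt_set mirrors BCH_FS_discard_mount_opt_set in the patch */
    static bool discard_enabled(bool mount_opt_set, bool mount_opt, bool dev_opt)
    {
    	return mount_opt_set ? mount_opt : dev_opt;
    }

    int main(void)
    {
    	/* device has discard persisted on; "-o nodiscard" at mount wins: */
    	printf("%d\n", discard_enabled(true, false, true));	/* 0 */
    	/* no discard option given at mount: the member flag decides: */
    	printf("%d\n", discard_enabled(false, false, true));	/* 1 */
    	return 0;
    }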
@@ -1868,7 +1880,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
 		s->discarded++;
 		*discard_pos_done = iter.pos;
 
-		if (ca->mi.discard && !c->opts.nochanges) {
+		if (discard_opt_enabled(c, ca) && !c->opts.nochanges) {
 			/*
 			 * This works without any other locks because this is the only
 			 * thread that removes items from the need_discard tree
@@ -1897,7 +1909,10 @@ commit:
 	if (ret)
 		goto out;
 
-	count_event(c, bucket_discard);
+	if (!fastpath)
+		count_event(c, bucket_discard);
+	else
+		count_event(c, bucket_discard_fast);
out:
fsck_err:
 	if (discard_locked)
@@ -2055,16 +2070,71 @@ put_ref:
 	bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
 }
 
+static int invalidate_one_bp(struct btree_trans *trans,
+			     struct bch_dev *ca,
+			     struct bkey_s_c_backpointer bp,
+			     struct bkey_buf *last_flushed)
+{
+	struct btree_iter extent_iter;
+	struct bkey_s_c extent_k =
+		bch2_backpointer_get_key(trans, bp, &extent_iter, 0, last_flushed);
+	int ret = bkey_err(extent_k);
+	if (ret)
+		return ret;
+
+	struct bkey_i *n =
+		bch2_bkey_make_mut(trans, &extent_iter, &extent_k,
+				   BTREE_UPDATE_internal_snapshot_node);
+	ret = PTR_ERR_OR_ZERO(n);
+	if (ret)
+		goto err;
+
+	bch2_bkey_drop_device(bkey_i_to_s(n), ca->dev_idx);
+err:
+	bch2_trans_iter_exit(trans, &extent_iter);
+	return ret;
+}
+
+static int invalidate_one_bucket_by_bps(struct btree_trans *trans,
+					struct bch_dev *ca,
+					struct bpos bucket,
+					u8 gen,
+					struct bkey_buf *last_flushed)
+{
+	struct bpos bp_start	= bucket_pos_to_bp_start(ca, bucket);
+	struct bpos bp_end	= bucket_pos_to_bp_end(ca, bucket);
+
+	return for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers,
+					     bp_start, bp_end, 0, k,
+					     NULL, NULL,
+					     BCH_WATERMARK_btree|
+					     BCH_TRANS_COMMIT_no_enospc, ({
+		if (k.k->type != KEY_TYPE_backpointer)
+			continue;
+
+		struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
+
+		if (bp.v->bucket_gen != gen)
+			continue;
+
+		/* filter out bps with gens that don't match */
+
+		invalidate_one_bp(trans, ca, bp, last_flushed);
+	}));
+}
+
+noinline_for_stack
 static int invalidate_one_bucket(struct btree_trans *trans,
+				 struct bch_dev *ca,
 				 struct btree_iter *lru_iter,
 				 struct bkey_s_c lru_k,
+				 struct bkey_buf *last_flushed,
 				 s64 *nr_to_invalidate)
 {
 	struct bch_fs *c = trans->c;
-	struct bkey_i_alloc_v4 *a = NULL;
 	struct printbuf buf = PRINTBUF;
 	struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
-	unsigned cached_sectors;
+	struct btree_iter alloc_iter = {};
 	int ret = 0;
 
 	if (*nr_to_invalidate <= 0)
@@ -2081,35 +2151,37 @@ static int invalidate_one_bucket(struct btree_trans *trans,
 	if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
 		return 0;
 
-	a = bch2_trans_start_alloc_update(trans, bucket, BTREE_TRIGGER_bucket_invalidate);
-	ret = PTR_ERR_OR_ZERO(a);
+	struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter,
+						     BTREE_ID_alloc, bucket,
+						     BTREE_ITER_cached);
+	ret = bkey_err(alloc_k);
 	if (ret)
-		goto out;
+		return ret;
+
+	struct bch_alloc_v4 a_convert;
+	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
 
 	/* We expect harmless races here due to the btree write buffer: */
-	if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v))
+	if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(*a))
 		goto out;
 
-	BUG_ON(a->v.data_type != BCH_DATA_cached);
-	BUG_ON(a->v.dirty_sectors);
+	/*
+	 * Impossible since alloc_lru_idx_read() only returns nonzero if the
+	 * bucket is supposed to be on the cached bucket LRU (i.e.
+	 * BCH_DATA_cached)
+	 *
+	 * bch2_lru_validate() also disallows lru keys with lru_pos_time() == 0
+	 */
+	BUG_ON(a->data_type != BCH_DATA_cached);
+	BUG_ON(a->dirty_sectors);
 
-	if (!a->v.cached_sectors)
+	if (!a->cached_sectors)
 		bch_err(c, "invalidating empty bucket, confused");
 
-	cached_sectors = a->v.cached_sectors;
-
-	SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
-	a->v.gen++;
-	a->v.data_type		= 0;
-	a->v.dirty_sectors	= 0;
-	a->v.stripe_sectors	= 0;
-	a->v.cached_sectors	= 0;
-	a->v.io_time[READ]	= bch2_current_io_time(c, READ);
-	a->v.io_time[WRITE]	= bch2_current_io_time(c, WRITE);
+	unsigned cached_sectors = a->cached_sectors;
+	u8 gen = a->gen;
 
-	ret = bch2_trans_commit(trans, NULL, NULL,
-				BCH_WATERMARK_btree|
-				BCH_TRANS_COMMIT_no_enospc);
+	ret = invalidate_one_bucket_by_bps(trans, ca, bucket, gen, last_flushed);
 	if (ret)
 		goto out;
 
@@ -2117,6 +2189,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
 	--*nr_to_invalidate;
out:
fsck_err:
+	bch2_trans_iter_exit(trans, &alloc_iter);
 	printbuf_exit(&buf);
 	return ret;
 }
@@ -2143,6 +2216,10 @@ static void bch2_do_invalidates_work(struct work_struct *work)
 	struct btree_trans *trans = bch2_trans_get(c);
 	int ret = 0;
 
+	struct bkey_buf last_flushed;
+	bch2_bkey_buf_init(&last_flushed);
+	bkey_init(&last_flushed.k->k);
+
 	ret = bch2_btree_write_buffer_tryflush(trans);
 	if (ret)
 		goto err;
@@ -2167,7 +2244,7 @@ static void bch2_do_invalidates_work(struct work_struct *work)
 		if (!k.k)
 			break;
 
-		ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate);
+		ret = invalidate_one_bucket(trans, ca, &iter, k, &last_flushed, &nr_to_invalidate);
restart_err:
 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 			continue;
@@ -2180,6 +2257,7 @@ restart_err:
err:
 	bch2_trans_put(trans);
 	percpu_ref_put(&ca->io_ref);
+	bch2_bkey_buf_exit(&last_flushed, c);
 	bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
 }
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index de25ba4ee94b..c556ccaffe89 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -131,7 +131,7 @@ static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
 	if (a.stripe)
 		return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe;
 	if (bch2_bucket_sectors_dirty(a))
-		return data_type;
+		return bucket_data_type(data_type);
 	if (a.cached_sectors)
 		return BCH_DATA_cached;
 	if (BCH_ALLOC_V4_NEED_DISCARD(&a))
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 5a781fb4c794..da0d72928b5b 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -127,14 +127,14 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
 
 void bch2_open_bucket_write_error(struct bch_fs *c,
 				  struct open_buckets *obs,
-				  unsigned dev)
+				  unsigned dev, int err)
 {
 	struct open_bucket *ob;
 	unsigned i;
 
 	open_bucket_for_each(c, obs, ob, i)
 		if (ob->dev == dev && ob->ec)
-			bch2_ec_bucket_cancel(c, ob);
+			bch2_ec_bucket_cancel(c, ob, err);
 }
 
 static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
@@ -179,23 +179,6 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
 	closure_wake_up(&c->freelist_wait);
 }
 
-static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
-{
-	switch (watermark) {
-	case BCH_WATERMARK_interior_updates:
-		return 0;
-	case BCH_WATERMARK_reclaim:
-		return OPEN_BUCKETS_COUNT / 6;
-	case BCH_WATERMARK_btree:
-	case BCH_WATERMARK_btree_copygc:
-		return OPEN_BUCKETS_COUNT / 4;
-	case BCH_WATERMARK_copygc:
-		return OPEN_BUCKETS_COUNT / 3;
-	default:
-		return OPEN_BUCKETS_COUNT / 2;
-	}
-}
-
 static inline bool may_alloc_bucket(struct bch_fs *c,
 				    struct bpos bucket,
 				    struct bucket_alloc_state *s)
@@ -239,7 +222,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
 
 	spin_lock(&c->freelist_lock);
 
-	if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) {
+	if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark))) {
 		if (cl)
 			closure_wait(&c->open_buckets_wait, cl);
 
@@ -648,7 +631,7 @@ static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
 			       struct bch_dev_usage *usage)
 {
 	u64 *v = stripe->next_alloc + ca->dev_idx;
-	u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal);
+	u64 free_space = __dev_buckets_available(ca, *usage, BCH_WATERMARK_normal);
 	u64 free_space_inv = free_space
 		? div64_u64(1ULL << 48, free_space)
 		: 1ULL << 48;
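A note on the 1ULL << 48 constant above: the stripe allocator keeps a running counter per device and bumps it by the reciprocal of that device's free space in fixed point; the device with the lowest counter is picked next, so devices with more free space accrue weight more slowly and are visited proportionally more often. A standalone sketch of the arithmetic with made-up bucket counts:

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t free_space_inv(uint64_t free_buckets)
    {
    	return free_buckets ? (1ULL << 48) / free_buckets : 1ULL << 48;
    }

    int main(void)
    {
    	/* 4x the free space => 1/4 the per-allocation increment,
    	 * so the roomier device is chosen ~4x as often: */
    	printf("%llu\n", (unsigned long long)free_space_inv(1000));
    	printf("%llu\n", (unsigned long long)free_space_inv(4000));
    	return 0;
    }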
@@ -728,7 +711,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
 		struct bch_dev_usage usage;
 		struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type,
-						cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage);
+						cl, flags & BCH_WRITE_alloc_nowait, &usage);
 		if (!IS_ERR(ob))
 			bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
 		bch2_dev_put(ca);
@@ -1336,7 +1319,7 @@ retry:
 	if (wp->data_type != BCH_DATA_user)
 		have_cache = true;
 
-	if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
+	if (target && !(flags & BCH_WRITE_only_specified_devs)) {
 		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
 					      target, erasure_code,
 					      nr_replicas, &nr_effective,
@@ -1426,7 +1409,7 @@ err:
 	if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
 		ret = -BCH_ERR_bucket_alloc_blocked;
 
-	if (cl && !(flags & BCH_WRITE_ALLOC_NOWAIT) &&
+	if (cl && !(flags & BCH_WRITE_alloc_nowait) &&
 	    bch2_err_matches(ret, BCH_ERR_freelist_empty))
 		ret = -BCH_ERR_bucket_alloc_blocked;
 
@@ -1577,7 +1560,7 @@ static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
 	unsigned i;
 
 	prt_printf(out, "%lu: ", wp->write_point);
-	prt_human_readable_u64(out, wp->sectors_allocated);
+	prt_human_readable_u64(out, wp->sectors_allocated << 9);
 
 	prt_printf(out, " last wrote: ");
 	bch2_pr_time_units(out, sched_clock() - wp->last_used);
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index f25481a0d1a0..69ec6a012898 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -33,6 +33,23 @@ static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob)
 	return bch2_dev_have_ref(c, ob->dev);
 }
 
+static inline unsigned bch2_open_buckets_reserved(enum bch_watermark watermark)
+{
+	switch (watermark) {
+	case BCH_WATERMARK_interior_updates:
+		return 0;
+	case BCH_WATERMARK_reclaim:
+		return OPEN_BUCKETS_COUNT / 6;
+	case BCH_WATERMARK_btree:
+	case BCH_WATERMARK_btree_copygc:
+		return OPEN_BUCKETS_COUNT / 4;
+	case BCH_WATERMARK_copygc:
+		return OPEN_BUCKETS_COUNT / 3;
+	default:
+		return OPEN_BUCKETS_COUNT / 2;
+	}
+}
+
 struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *,
 				      enum bch_watermark,
 				      enum bch_data_type,
				      struct closure *);
@@ -65,7 +82,7 @@ static inline struct open_bucket *ec_open_bucket(struct bch_fs *c,
 }
 
 void bch2_open_bucket_write_error(struct bch_fs *,
-				  struct open_buckets *, unsigned);
+				  struct open_buckets *, unsigned, int);
 
 void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
 
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index 4aa8ee026cb8..8f79f46c2a78 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -90,6 +90,7 @@ struct dev_stripe_state {
 	x(stopped)		\
 	x(waiting_io)		\
 	x(waiting_work)		\
+	x(runnable)		\
 	x(running)
 
 enum write_point_state {
@@ -125,6 +126,7 @@ struct write_point {
 		enum write_point_state	state;
 		u64			last_state_change;
 		u64			time[WRITE_POINT_STATE_NR];
+		u64			last_runtime;
 	} __aligned(SMP_CACHE_BYTES);
 };
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index ebeb6a5ff9d2..21d1d86d5008 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -11,6 +11,7 @@
 #include "checksum.h"
 #include "disk_accounting.h"
 #include "error.h"
+#include "progress.h"
 
 #include <linux/mm.h>
 
@@ -49,6 +50,8 @@ void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bke
 	}
 
 	bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level);
+	prt_str(out, " data_type=");
+	bch2_prt_data_type(out, bp.v->data_type);
 	prt_printf(out, " suboffset=%u len=%u gen=%u pos=",
 		   (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
 		   bp.v->bucket_len,
@@ -93,6 +96,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
 {
 	struct bch_fs *c = trans->c;
 	struct printbuf buf = PRINTBUF;
+	int ret = 0;
 
 	if (insert) {
 		prt_printf(&buf, "existing backpointer found when inserting ");
@@ -122,17 +126,15 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
 
 		prt_printf(&buf, "for ");
 		bch2_bkey_val_to_text(&buf, c, orig_k);
-
-		bch_err(c, "%s", buf.buf);
 	}
 
-	printbuf_exit(&buf);
+	if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers &&
+	    __bch2_inconsistent_error(c, &buf))
+		ret = -BCH_ERR_erofs_unfixed_errors;
 
-	if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
-		return bch2_inconsistent_error(c) ? BCH_ERR_erofs_unfixed_errors : 0;
-	} else {
-		return 0;
-	}
+	bch_err(c, "%s", buf.buf);
+	printbuf_exit(&buf);
+	return ret;
 }
 
 int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
@@ -207,11 +209,11 @@ static int backpointer_target_not_found(struct btree_trans *trans,
 	if (ret)
 		return ret;
 
-	prt_printf(&buf, "backpointer doesn't match %s it points to:\n  ",
+	prt_printf(&buf, "backpointer doesn't match %s it points to:\n",
 		   bp.v->level ? "btree node" : "extent");
 	bch2_bkey_val_to_text(&buf, c, bp.s_c);
 
-	prt_printf(&buf, "\n  ");
+	prt_newline(&buf);
 	bch2_bkey_val_to_text(&buf, c, target_k);
 
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(target_k);
@@ -219,7 +221,7 @@ static int backpointer_target_not_found(struct btree_trans *trans,
 	struct extent_ptr_decoded p;
 	bkey_for_each_ptr_decode(target_k.k, ptrs, p, entry)
 		if (p.ptr.dev == bp.k->p.inode) {
-			prt_printf(&buf, "\n  ");
+			prt_newline(&buf);
 			struct bkey_i_backpointer bp2;
 			bch2_extent_ptr_to_bp(c, bp.v->btree_id, bp.v->level, target_k, p, entry, &bp2);
 			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp2.k_i));
@@ -244,27 +246,31 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
 	if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c)))
 		return bkey_s_c_null;
 
-	if (likely(!bp.v->level)) {
-		bch2_trans_node_iter_init(trans, iter,
-					  bp.v->btree_id,
-					  bp.v->pos,
-					  0, 0,
-					  iter_flags);
-		struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
-		if (bkey_err(k)) {
-			bch2_trans_iter_exit(trans, iter);
-			return k;
-		}
+	bch2_trans_node_iter_init(trans, iter,
+				  bp.v->btree_id,
+				  bp.v->pos,
+				  0,
+				  bp.v->level,
+				  iter_flags);
+	struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+	if (bkey_err(k)) {
+		bch2_trans_iter_exit(trans, iter);
+		return k;
+	}
 
-		if (k.k &&
-		    extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp))
-			return k;
+	if (k.k &&
+	    extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp))
+		return k;
 
-		bch2_trans_iter_exit(trans, iter);
+	bch2_trans_iter_exit(trans, iter);
+
+	if (!bp.v->level) {
 		int ret = backpointer_target_not_found(trans, bp, k, last_flushed);
 		return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
 	} else {
 		struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed);
+		if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node))
+			return bkey_s_c_null;
 		if (IS_ERR_OR_NULL(b))
 			return ((struct bkey_s_c) { .k = ERR_CAST(b) });
 
@@ -436,12 +442,11 @@ found:
 	if (ret)
 		goto err;
 
-	prt_str(&buf, "extents pointing to same space, but first extent checksum bad:");
-	prt_printf(&buf, "\n  ");
+	prt_printf(&buf, "extents pointing to same space, but first extent checksum bad:\n");
 	bch2_btree_id_to_text(&buf, btree);
 	prt_str(&buf, " ");
 	bch2_bkey_val_to_text(&buf, c, extent);
-	prt_printf(&buf, "\n  ");
+	prt_newline(&buf);
 	bch2_btree_id_to_text(&buf, o_btree);
 	prt_str(&buf, " ");
 	bch2_bkey_val_to_text(&buf, c, extent2);
@@ -514,11 +519,27 @@ check_existing_bp:
 	if (!other_extent.k)
 		goto missing;
 
+	rcu_read_lock();
+	struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp->k.p.inode);
+	if (ca) {
+		struct bkey_ptrs_c other_extent_ptrs = bch2_bkey_ptrs_c(other_extent);
+		bkey_for_each_ptr(other_extent_ptrs, ptr)
+			if (ptr->dev == bp->k.p.inode &&
+			    dev_ptr_stale_rcu(ca, ptr)) {
+				ret = drop_dev_and_update(trans, other_bp.v->btree_id,
+							  other_extent, bp->k.p.inode);
+				if (ret)
+					goto err;
+				goto out;
+			}
+	}
+	rcu_read_unlock();
+
 	if (bch2_extents_match(orig_k, other_extent)) {
 		printbuf_reset(&buf);
-		prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n  ");
+		prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n");
 		bch2_bkey_val_to_text(&buf, c, orig_k);
-		prt_str(&buf, "\n  ");
+		prt_newline(&buf);
 		bch2_bkey_val_to_text(&buf, c, other_extent);
 		bch_err(c, "%s", buf.buf);
 
@@ -557,20 +578,20 @@ check_existing_bp:
 	}
 
 	printbuf_reset(&buf);
-	prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n  ", bp->k.p.inode);
+	prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n", bp->k.p.inode);
 	bch2_bkey_val_to_text(&buf, c, orig_k);
-	prt_str(&buf, "\n  ");
+	prt_newline(&buf);
 	bch2_bkey_val_to_text(&buf, c, other_extent);
 	bch_err(c, "%s", buf.buf);
 	ret = -BCH_ERR_fsck_repair_unimplemented;
 	goto err;
missing:
 	printbuf_reset(&buf);
-	prt_str(&buf, "missing backpointer\n  for:  ");
+	prt_str(&buf, "missing backpointer\nfor:  ");
 	bch2_bkey_val_to_text(&buf, c, orig_k);
-	prt_printf(&buf, "\n  want: ");
+	prt_printf(&buf, "\nwant: ");
 	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i));
-	prt_printf(&buf, "\n  got:  ");
+	prt_printf(&buf, "\ngot:  ");
 	bch2_bkey_val_to_text(&buf, c, bp_k);
 
 	if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf))
@@ -590,9 +611,6 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
 	struct extent_ptr_decoded p;
 
 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-		if (p.ptr.cached)
-			continue;
-
 		if (p.ptr.dev == BCH_SB_MEMBER_INVALID)
 			continue;
 
@@ -600,9 +618,11 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
 		struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
 		bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches);
 		bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty);
+
+		bool stale = p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr));
 		rcu_read_unlock();
 
-		if (check || empty) {
+		if ((check || empty) && !stale) {
 			struct bkey_i_backpointer bp;
 			bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp);
 
@@ -715,71 +735,6 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
 	return ret;
 }
 
-struct progress_indicator_state {
-	unsigned long		next_print;
-	u64			nodes_seen;
-	u64			nodes_total;
-	struct btree		*last_node;
-};
-
-static inline void progress_init(struct progress_indicator_state *s,
-				 struct bch_fs *c,
-				 u64 btree_id_mask)
-{
-	memset(s, 0, sizeof(*s));
-
-	s->next_print = jiffies + HZ * 10;
-
-	for (unsigned i = 0; i < BTREE_ID_NR; i++) {
-		if (!(btree_id_mask & BIT_ULL(i)))
-			continue;
-
-		struct disk_accounting_pos acc = {
-			.type		= BCH_DISK_ACCOUNTING_btree,
-			.btree.id	= i,
-		};
-
-		u64 v;
-		bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
-		s->nodes_total += div64_ul(v, btree_sectors(c));
-	}
-}
-
-static inline bool progress_update_p(struct progress_indicator_state *s)
-{
-	bool ret = time_after_eq(jiffies, s->next_print);
-
-	if (ret)
-		s->next_print = jiffies + HZ * 10;
-	return ret;
-}
-
-static void progress_update_iter(struct btree_trans *trans,
-				 struct progress_indicator_state *s,
-				 struct btree_iter *iter,
-				 const char *msg)
-{
-	struct bch_fs *c = trans->c;
-	struct btree *b = path_l(btree_iter_path(trans, iter))->b;
-
-	s->nodes_seen += b != s->last_node;
-	s->last_node = b;
-
-	if (progress_update_p(s)) {
-		struct printbuf buf = PRINTBUF;
-		unsigned percent = s->nodes_total
-			? div64_u64(s->nodes_seen * 100, s->nodes_total)
-			: 0;
-
-		prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ",
-			   msg, percent, s->nodes_seen, s->nodes_total);
-		bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos));
-
-		bch_info(c, "%s", buf.buf);
-		printbuf_exit(&buf);
-	}
-}
-
 static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
 						   struct extents_to_bp_state *s)
 {
@@ -787,7 +742,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
 	struct progress_indicator_state progress;
 	int ret = 0;
 
-	progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink));
+	bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink));
 
 	for (enum btree_id btree_id = 0;
 	     btree_id < btree_id_nr_alive(c);
@@ -806,7 +761,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
 					  BTREE_ITER_prefetch);
 
 		ret = for_each_btree_key_continue(trans, iter, 0, k, ({
-			progress_update_iter(trans, &progress, &iter, "extents_to_backpointers");
+			bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers");
 			check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
 			bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
 		}));
@@ -827,7 +782,7 @@ enum alloc_sector_counter {
 	ALLOC_SECTORS_NR
 };
 
-static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t)
+static int data_type_to_alloc_counter(enum bch_data_type t)
 {
 	switch (t) {
 	case BCH_DATA_btree:
@@ -836,9 +791,10 @@ static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t
 	case BCH_DATA_cached:
 		return ALLOC_cached;
 	case BCH_DATA_stripe:
+	case BCH_DATA_parity:
 		return ALLOC_stripe;
 	default:
-		BUG();
+		return -1;
 	}
 }
 
@@ -889,7 +845,11 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
 		if (bp.v->bucket_gen != a->gen)
 			continue;
 
-		sectors[data_type_to_alloc_counter(bp.v->data_type)] += bp.v->bucket_len;
+		int alloc_counter = data_type_to_alloc_counter(bp.v->data_type);
+		if (alloc_counter < 0)
+			continue;
+
+		sectors[alloc_counter] += bp.v->bucket_len;
 	};
 	bch2_trans_iter_exit(trans, &iter);
 	if (ret)
@@ -901,9 +861,8 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
 		goto err;
 	}
 
-	/* Cached pointers don't have backpointers: */
-
 	if (sectors[ALLOC_dirty] != a->dirty_sectors ||
+	    sectors[ALLOC_cached] != a->cached_sectors ||
 	    sectors[ALLOC_stripe] != a->stripe_sectors) {
 		if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) {
 			ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed);
@@ -912,6 +871,7 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
 		}
 
 		if (sectors[ALLOC_dirty] > a->dirty_sectors ||
+		    sectors[ALLOC_cached] > a->cached_sectors ||
 		    sectors[ALLOC_stripe] > a->stripe_sectors) {
 			ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?:
 				-BCH_ERR_transaction_restart_nested;
@@ -919,7 +879,8 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
 		}
 
 		if (!sectors[ALLOC_dirty] &&
-		    !sectors[ALLOC_stripe])
+		    !sectors[ALLOC_stripe] &&
+		    !sectors[ALLOC_cached])
 			__set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty);
 		else
 			__set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches);
@@ -1060,7 +1021,7 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
 	 * Can't allow devices to come/go/resize while we have bucket bitmaps
 	 * allocated
 	 */
-	lockdep_assert_held(&c->state_lock);
+	down_read(&c->state_lock);
 
 	for_each_member_device(c, ca) {
 		BUG_ON(ca->bucket_backpointer_mismatches);
@@ -1145,6 +1106,7 @@ err_free_bitmaps:
 		ca->bucket_backpointer_mismatches = NULL;
 	}
 
+	up_read(&c->state_lock);
 	bch_err_fn(c, ret);
 	return ret;
 }
@@ -1206,11 +1168,11 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
 	bch2_bkey_buf_init(&last_flushed);
 	bkey_init(&last_flushed.k->k);
 
-	progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers));
+	bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers));
 
 	int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers,
 				     POS_MIN, BTREE_ITER_prefetch, k, ({
-		progress_update_iter(trans, &progress, &iter, "backpointers_to_extents");
+		bch2_progress_update_iter(trans, &progress, &iter, "backpointers_to_extents");
 		check_one_backpointer(trans, start, end, k, &last_flushed);
 	}));
 
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
index 060dad1521ee..16575dbc5736 100644
--- a/fs/bcachefs/backpointers.h
+++ b/fs/bcachefs/backpointers.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H
-#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+#ifndef _BCACHEFS_BACKPOINTERS_H
+#define _BCACHEFS_BACKPOINTERS_H
 
 #include "btree_cache.h"
 #include "btree_iter.h"
@@ -123,7 +123,12 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k,
 		return BCH_DATA_btree;
 	case KEY_TYPE_extent:
 	case KEY_TYPE_reflink_v:
-		return p.has_ec ? BCH_DATA_stripe : BCH_DATA_user;
+		if (p.has_ec)
+			return BCH_DATA_stripe;
+		if (p.ptr.cached)
+			return BCH_DATA_cached;
+		else
+			return BCH_DATA_user;
 	case KEY_TYPE_stripe: {
 		const struct bch_extent_ptr *ptr = &entry->ptr;
 		struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
@@ -147,7 +152,20 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
 					 struct bkey_i_backpointer *bp)
 {
 	bkey_backpointer_init(&bp->k_i);
-	bp->k.p = POS(p.ptr.dev, ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset);
+	bp->k.p.inode = p.ptr.dev;
+
+	if (k.k->type != KEY_TYPE_stripe)
+		bp->k.p.offset = ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset;
+	else {
+		/*
+		 * Put stripe backpointers where they won't collide with the
+		 * extent backpointers within the stripe:
+		 */
+		struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+		bp->k.p.offset = ((u64) (p.ptr.offset + le16_to_cpu(s.v->sectors)) <<
+				  MAX_EXTENT_COMPRESS_RATIO_SHIFT) - 1;
+	}
+
 	bp->v = (struct bch_backpointer) {
 		.btree_id	= btree_id,
 		.level		= level,
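The collision-avoidance trick in bch2_extent_ptr_to_bp() above can be sanity-checked with a little arithmetic: extent backpointers inside the stripe encode a sector offset scaled by 2^MAX_EXTENT_COMPRESS_RATIO_SHIFT plus a crc offset, while the stripe backpointer is parked one slot below the position a following extent starting at offset + sectors would use. A standalone sketch (SHIFT stands in for the real constant, and the numbers are made up):

    #include <assert.h>
    #include <stdint.h>

    #define SHIFT 4	/* stands in for MAX_EXTENT_COMPRESS_RATIO_SHIFT */

    int main(void)
    {
    	uint64_t offset = 2048, sectors = 256;

    	/* where the patch puts the stripe backpointer: */
    	uint64_t stripe_bp = ((offset + sectors) << SHIFT) - 1;

    	/* strictly below the base slot of any extent starting at or
    	 * beyond offset + sectors ... */
    	assert(stripe_bp < ((offset + sectors) << SHIFT));
    	/* ... and above the base slot of the stripe's last sector: */
    	assert(stripe_bp > ((offset + sectors - 1) << SHIFT));
    	return 0;
    }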
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 161cf2f05d2a..f52311017aee 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -203,6 +203,7 @@
 #include <linux/types.h>
 #include <linux/workqueue.h>
 #include <linux/zstd.h>
+#include <linux/unicode.h>
 
 #include "bcachefs_format.h"
 #include "btree_journal_iter_types.h"
@@ -444,6 +445,7 @@ BCH_DEBUG_PARAMS_DEBUG()
 	x(btree_node_sort)			\
 	x(btree_node_read)			\
 	x(btree_node_read_done)			\
+	x(btree_node_write)			\
 	x(btree_interior_update_foreground)	\
 	x(btree_interior_update_total)		\
 	x(btree_gc)				\
@@ -456,6 +458,7 @@ BCH_DEBUG_PARAMS_DEBUG()
 	x(blocked_journal_low_on_space)		\
 	x(blocked_journal_low_on_pin)		\
 	x(blocked_journal_max_in_flight)	\
+	x(blocked_journal_max_open)		\
 	x(blocked_key_cache_flush)		\
 	x(blocked_allocate)			\
 	x(blocked_allocate_open_bucket)		\
@@ -533,6 +536,7 @@ struct bch_dev {
 	 */
 	struct bch_member_cpu	mi;
 	atomic64_t		errors[BCH_MEMBER_ERROR_NR];
+	unsigned long		write_errors_start;
 
 	__uuid_t		uuid;
 	char			name[BDEVNAME_SIZE];
@@ -623,7 +627,8 @@ struct bch_dev {
 	x(topology_error)		\
 	x(errors_fixed)			\
 	x(errors_not_fixed)		\
-	x(no_invalid_checks)
+	x(no_invalid_checks)		\
+	x(discard_mount_opt_set)	\
 
 enum bch_fs_flags {
 #define x(n)		BCH_FS_##n,
@@ -687,7 +692,8 @@ struct btree_trans_buf {
 	x(gc_gens)				\
 	x(snapshot_delete_pagecache)		\
 	x(sysfs)				\
-	x(btree_write_buffer)
+	x(btree_write_buffer)			\
+	x(btree_node_scrub)
 
 enum bch_write_ref {
 #define x(n) BCH_WRITE_REF_##n,
@@ -696,6 +702,8 @@ enum bch_write_ref {
 	BCH_WRITE_REF_NR,
 };
 
+#define BCH_FS_DEFAULT_UTF8_ENCODING UNICODE_AGE(12, 1, 0)
+
 struct bch_fs {
 	struct closure		cl;
 
@@ -780,6 +788,9 @@ struct bch_fs {
 		u64		btrees_lost_data;
 	} sb;
 
+#ifdef CONFIG_UNICODE
+	struct unicode_map	*cf_encoding;
+#endif
 
 	struct bch_sb_handle	disk_sb;
 
@@ -969,7 +980,6 @@ struct bch_fs {
 	mempool_t		compress_workspace[BCH_COMPRESSION_OPT_NR];
 	size_t			zstd_workspace_size;
 
-	struct crypto_shash	*sha256;
 	struct crypto_sync_skcipher *chacha20;
 	struct crypto_shash	*poly1305;
 
@@ -993,15 +1003,11 @@ struct bch_fs {
 	wait_queue_head_t	copygc_running_wq;
 
 	/* STRIPES: */
-	GENRADIX(struct stripe) stripes;
 	GENRADIX(struct gc_stripe) gc_stripes;
 
 	struct hlist_head	ec_stripes_new[32];
 	spinlock_t		ec_stripes_new_lock;
 
-	ec_stripes_heap		ec_stripes_heap;
-	struct mutex		ec_stripes_heap_lock;
-
 	/* ERASURE CODING */
 	struct list_head	ec_stripe_head_list;
 	struct mutex		ec_stripe_head_lock;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index f70f0108401f..a3db328dee31 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -686,7 +686,12 @@ struct bch_sb_field_ext {
 	x(inode_depth,			BCH_VERSION(1, 17))		\
 	x(persistent_inode_cursors,	BCH_VERSION(1, 18))		\
 	x(autofix_errors,		BCH_VERSION(1, 19))		\
-	x(directory_size,		BCH_VERSION(1, 20))
+	x(directory_size,		BCH_VERSION(1, 20))		\
+	x(cached_backpointers,		BCH_VERSION(1, 21))		\
+	x(stripe_backpointers,		BCH_VERSION(1, 22))		\
+	x(stripe_lru,			BCH_VERSION(1, 23))		\
+	x(casefolding,			BCH_VERSION(1, 24))		\
+	x(extent_flags,			BCH_VERSION(1, 25))
 
 enum bcachefs_metadata_version {
 	bcachefs_metadata_version_min = 9,
@@ -837,6 +842,7 @@ LE64_BITMASK(BCH_SB_SHARD_INUMS,	struct bch_sb, flags[3], 28, 29);
 LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
 LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
 LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
+/* one free bit */
 LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
 LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
 LE64_BITMASK(BCH_SB_NOCOW,		struct bch_sb, flags[4], 33, 34);
@@ -855,6 +861,8 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT,	struct bch_sb, flags[5], 32, 48);
 LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
					struct bch_sb, flags[5], 48, 64);
 LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS,	struct bch_sb, flags[6],  0,  4);
+LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6],  4, 14);
+LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR,	struct bch_sb, flags[6], 14, 20);
 
 static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
 {
@@ -908,7 +916,8 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u
 	x(journal_no_flush,		16)	\
 	x(alloc_v2,			17)	\
 	x(extents_across_btree_nodes,	18)	\
-	x(incompat_version_field,	19)
+	x(incompat_version_field,	19)	\
+	x(casefolding,			20)
 
 #define BCH_SB_FEATURES_ALWAYS				\
 	(BIT_ULL(BCH_FEATURE_new_extent_overwrite)|	\
@@ -922,7 +931,8 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u
 	 BIT_ULL(BCH_FEATURE_new_siphash)|		\
 	 BIT_ULL(BCH_FEATURE_btree_ptr_v2)|		\
 	 BIT_ULL(BCH_FEATURE_new_varint)|		\
-	 BIT_ULL(BCH_FEATURE_journal_no_flush))
+	 BIT_ULL(BCH_FEATURE_journal_no_flush)|		\
+	 BIT_ULL(BCH_FEATURE_incompat_version_field))
 
 enum bch_sb_feature {
 #define x(f, n) BCH_FEATURE_##f,
@@ -1133,7 +1143,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
 	x(log,			9)		\
 	x(overwrite,		10)		\
 	x(write_buffer_keys,	11)		\
-	x(datetime,		12)
+	x(datetime,		12)		\
+	x(log_bkey,		13)
 
 enum bch_jset_entry_type {
#define x(f, nr)	BCH_JSET_ENTRY_##f = nr,
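For readers new to these declarations: LE64_BITMASK(name, type, field, start, end) generates a getter/setter pair over the half-open bit range [start, end) of one little-endian word. A sketch of what the generated getter computes for the new BCH_SB_WRITE_ERROR_TIMEOUT (bits 4-14 of flags[6]); the real macro also performs the le64_to_cpu conversion omitted here:

    #include <stdint.h>

    static uint64_t sb_write_error_timeout(uint64_t flags6 /* cpu-endian */)
    {
    	const unsigned start = 4, end = 14;

    	return (flags6 >> start) & ((1ULL << (end - start)) - 1);
    }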
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
index 3c23bdf788ce..52594e925eb7 100644
--- a/fs/bcachefs/bcachefs_ioctl.h
+++ b/fs/bcachefs/bcachefs_ioctl.h
@@ -87,6 +87,7 @@ struct bch_ioctl_incremental {
 #define BCH_IOCTL_FSCK_OFFLINE	_IOW(0xbc, 19, struct bch_ioctl_fsck_offline)
 #define BCH_IOCTL_FSCK_ONLINE	_IOW(0xbc, 20, struct bch_ioctl_fsck_online)
 #define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting)
+#define BCH_IOCTL_QUERY_COUNTERS _IOW(0xbc, 21, struct bch_ioctl_query_counters)
 
 /* ioctl below act on a particular file, not the filesystem as a whole: */
 
@@ -215,6 +216,10 @@ struct bch_ioctl_data {
 	union {
 	struct {
 		__u32		dev;
+		__u32		data_types;
+	} scrub;
+	struct {
+		__u32		dev;
 		__u32		pad;
 	} migrate;
 	struct {
@@ -229,6 +234,11 @@ enum bch_data_event {
 	BCH_DATA_EVENT_NR	= 1,
 };
 
+enum data_progress_data_type_special {
+	DATA_PROGRESS_DATA_TYPE_phys	= 254,
+	DATA_PROGRESS_DATA_TYPE_done	= 255,
+};
+
 struct bch_ioctl_data_progress {
 	__u8			data_type;
 	__u8			btree_id;
@@ -237,11 +247,19 @@ struct bch_ioctl_data_progress {
 
 	__u64			sectors_done;
 	__u64			sectors_total;
+	__u64			sectors_error_corrected;
+	__u64			sectors_error_uncorrected;
 } __packed __aligned(8);
 
+enum bch_ioctl_data_event_ret {
+	BCH_IOCTL_DATA_EVENT_RET_done		= 1,
+	BCH_IOCTL_DATA_EVENT_RET_device_offline	= 2,
+};
+
 struct bch_ioctl_data_event {
 	__u8			type;
-	__u8			pad[7];
+	__u8			ret;
+	__u8			pad[6];
 	union {
 	struct bch_ioctl_data_progress p;
 	__u64			pad2[15];
@@ -443,4 +461,13 @@ struct bch_ioctl_query_accounting {
 	struct bkey_i_accounting accounting[];
 };
 
+#define BCH_IOCTL_QUERY_COUNTERS_MOUNT	(1 << 0)
+
+struct bch_ioctl_query_counters {
+	__u16			nr;
+	__u16			flags;
+	__u32			pad;
+	__u64			d[];
+};
+
 #endif /* _BCACHEFS_IOCTL_H */
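A hypothetical userspace caller of the new counters ioctl might look like the sketch below. It assumes the header definitions above are available out of tree (as they are via bcachefs-tools), and that the kernel reports how many slots of d[] it filled back through ->nr — an assumption, not something this hunk shows:

    #include <stdlib.h>
    #include <sys/ioctl.h>
    /* plus the bcachefs_ioctl.h definitions shown above */

    static struct bch_ioctl_query_counters *query_counters(int fs_fd, unsigned nr)
    {
    	struct bch_ioctl_query_counters *q =
    		calloc(1, sizeof(*q) + nr * sizeof(q->d[0]));

    	q->nr = nr;
    	q->flags = BCH_IOCTL_QUERY_COUNTERS_MOUNT;	/* since-mount values */

    	if (ioctl(fs_fd, BCH_IOCTL_QUERY_COUNTERS, q)) {
    		free(q);
    		return NULL;
    	}
    	return q;	/* q->d[0..q->nr) holds the counter values */
    }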
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 1ec1f90e0eb3..9b80201c7982 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -610,6 +610,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
		       btree_node_write_in_flight(b));
 
 		btree_node_data_free(bc, b);
+		cond_resched();
 	}
 
 	BUG_ON(!bch2_journal_error(&c->journal) &&
@@ -1416,7 +1417,7 @@ void __bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
 		prt_printf(out, "%u", r->level);
 	else
 		prt_printf(out, "(unknown)");
-	prt_printf(out, "\n  ");
+	prt_newline(out);
 
 	bch2_bkey_val_to_text(out, c, k);
 }
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index dd1d9b74076e..2025d408979c 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -27,6 +27,7 @@
 #include "journal.h"
 #include "keylist.h"
 #include "move.h"
+#include "progress.h"
 #include "recovery_passes.h"
 #include "reflink.h"
 #include "recovery.h"
@@ -212,15 +213,15 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *
 
 	prt_printf(&buf, "  at ");
 	bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
-	prt_printf(&buf, ":\n  parent: ");
+	prt_printf(&buf, ":\nparent: ");
 	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
 
 	if (prev) {
-		prt_printf(&buf, "\n  prev: ");
+		prt_printf(&buf, "\nprev: ");
 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key));
 	}
 
-	prt_str(&buf, "\n  next: ");
+	prt_str(&buf, "\nnext: ");
 	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key));
 
 	if (bpos_lt(expected_start, cur->data->min_key)) {	/* gap */
@@ -279,12 +280,12 @@ static int btree_repair_node_end(struct btree_trans *trans, struct btree *b,
 	if (bpos_eq(child->key.k.p, b->key.k.p))
 		return 0;
 
-	prt_printf(&buf, "  at ");
+	prt_printf(&buf, "\nat: ");
 	bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
-	prt_printf(&buf, ":\n  parent: ");
+	prt_printf(&buf, "\nparent: ");
 	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
 
-	prt_str(&buf, "\n  child: ");
+	prt_str(&buf, "\nchild: ");
 	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key));
 
 	if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
@@ -350,8 +351,7 @@ again:
 
 			if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
					trans, btree_node_read_error,
-					"Topology repair: unreadable btree node at\n"
-					"  %s",
+					"Topology repair: unreadable btree node at\n%s",
					buf.buf)) {
 				bch2_btree_node_evict(trans, cur_k.k);
 				cur = NULL;
@@ -611,7 +611,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
 	if (fsck_err_on(btree_id != BTREE_ID_accounting &&
 			k.k->bversion.lo > atomic64_read(&c->key_version),
 			trans, bkey_version_in_future,
-			"key version number higher than recorded %llu\n  %s",
+			"key version number higher than recorded %llu\n%s",
 			atomic64_read(&c->key_version),
 			(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
 		atomic64_set(&c->key_version, k.k->bversion.lo);
@@ -619,7 +619,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
 
 	if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
				trans, btree_bitmap_not_marked,
-				"btree ptr not marked in member info btree allocated bitmap\n  %s",
+				"btree ptr not marked in member info btree allocated bitmap\n%s",
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, k),
				 buf.buf))) {
@@ -656,7 +656,9 @@ fsck_err:
 	return ret;
 }
 
-static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial)
+static int bch2_gc_btree(struct btree_trans *trans,
+			 struct progress_indicator_state *progress,
+			 enum btree_id btree, bool initial)
 {
 	struct bch_fs *c = trans->c;
 	unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1;
@@ -673,6 +675,7 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in
				  BTREE_ITER_prefetch);
 
 		ret = for_each_btree_key_continue(trans, iter, 0, k, ({
+			bch2_progress_update_iter(trans, progress, &iter, "check_allocations");
 			gc_pos_set(c, gc_pos_btree(btree, level, k.k->p));
 			bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial);
 		}));
@@ -717,22 +720,24 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
 static int bch2_gc_btrees(struct bch_fs *c)
 {
 	struct btree_trans *trans = bch2_trans_get(c);
-	enum btree_id ids[BTREE_ID_NR];
 	struct printbuf buf = PRINTBUF;
-	unsigned i;
 	int ret = 0;
 
-	for (i = 0; i < BTREE_ID_NR; i++)
+	struct progress_indicator_state progress;
+	bch2_progress_init(&progress, c, ~0ULL);
+
+	enum btree_id ids[BTREE_ID_NR];
+	for (unsigned i = 0; i < BTREE_ID_NR; i++)
 		ids[i] = i;
 	bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
 
-	for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
+	for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
 		unsigned btree = i < BTREE_ID_NR ? ids[i] : i;
 
 		if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b))
 			continue;
 
-		ret = bch2_gc_btree(trans, btree, true);
+		ret = bch2_gc_btree(trans, &progress, btree, true);
 	}
 
 	printbuf_exit(&buf);
@@ -1015,8 +1020,7 @@ int bch2_check_allocations(struct bch_fs *c)
 {
 	int ret;
 
-	lockdep_assert_held(&c->state_lock);
-
+	down_read(&c->state_lock);
 	down_write(&c->gc_lock);
 
 	bch2_btree_interior_updates_flush(c);
@@ -1054,6 +1058,7 @@ out:
 	percpu_up_write(&c->mark_lock);
 	up_write(&c->gc_lock);
+	up_read(&c->state_lock);
 
 	/*
 	 * At startup, allocations can happen directly instead of via the
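The progress helpers btree_gc.c now pulls in from progress.h follow the pattern visible in the hunks above: initialize with a bitmap of btree IDs (~0ULL here, since check_allocations walks every tree), then tick once per key so a percentage can be logged periodically. Schematic usage, not a verbatim excerpt:

    struct progress_indicator_state progress;
    bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_extents));

    ret = for_each_btree_key(trans, iter, BTREE_ID_extents, POS_MIN,
    			 BTREE_ITER_prefetch, k, ({
    	bch2_progress_update_iter(trans, &progress, &iter, "my_pass");
    	/* ... per-key work ... */
    	0;
    }));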
{ - struct printbuf out = PRINTBUF; bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes; - va_list args; + + if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry) + ret = -BCH_ERR_btree_node_read_err_fixable; + if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry) + ret = -BCH_ERR_btree_node_read_err_bad_node; + + if (!silent && ret != -BCH_ERR_btree_node_read_err_fixable) + bch2_sb_error_count(c, err_type); + + struct printbuf out = PRINTBUF; + if (write != WRITE && ret != -BCH_ERR_btree_node_read_err_fixable) { + printbuf_indent_add_nextline(&out, 2); +#ifdef BCACHEFS_LOG_PREFIX + prt_printf(&out, bch2_log_msg(c, "")); +#endif + } btree_err_msg(&out, c, ca, b, i, k, b->written, write); + va_list args; va_start(args, fmt); prt_vprintf(&out, fmt, args); va_end(args); if (write == WRITE) { - bch2_print_string_as_lines(KERN_ERR, out.buf); - ret = c->opts.errors == BCH_ON_ERROR_continue - ? 0 - : -BCH_ERR_fsck_errors_not_fixed; - goto out; + prt_str(&out, ", "); + ret = __bch2_inconsistent_error(c, &out) + ? -BCH_ERR_fsck_errors_not_fixed + : 0; + silent = false; } - if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry) - ret = -BCH_ERR_btree_node_read_err_fixable; - if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry) - ret = -BCH_ERR_btree_node_read_err_bad_node; - - if (!silent && ret != -BCH_ERR_btree_node_read_err_fixable) - bch2_sb_error_count(c, err_type); - switch (ret) { case -BCH_ERR_btree_node_read_err_fixable: ret = !silent @@ -584,25 +590,21 @@ static int __btree_err(int ret, ret != -BCH_ERR_fsck_ignore) goto fsck_err; ret = -BCH_ERR_fsck_fix; - break; - case -BCH_ERR_btree_node_read_err_want_retry: - case -BCH_ERR_btree_node_read_err_must_retry: - if (!silent) - bch2_print_string_as_lines(KERN_ERR, out.buf); - break; + goto out; case -BCH_ERR_btree_node_read_err_bad_node: - if (!silent) - bch2_print_string_as_lines(KERN_ERR, out.buf); - ret = bch2_topology_error(c); + prt_str(&out, ", "); + ret = __bch2_topology_error(c, &out); + if (ret) + silent = false; break; case -BCH_ERR_btree_node_read_err_incompatible: - if (!silent) - bch2_print_string_as_lines(KERN_ERR, out.buf); ret = -BCH_ERR_fsck_errors_not_fixed; + silent = false; break; - default: - BUG(); } + + if (!silent) + bch2_print_string_as_lines(KERN_ERR, out.buf); out: fsck_err: printbuf_exit(&out); @@ -816,7 +818,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, -BCH_ERR_btree_node_read_err_bad_node, c, ca, b, i, NULL, btree_node_bad_format, - "invalid bkey format: %s\n %s", buf1.buf, + "invalid bkey format: %s\n%s", buf1.buf, (printbuf_reset(&buf2), bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf)); printbuf_reset(&buf1); @@ -1328,6 +1330,7 @@ static void btree_node_read_work(struct work_struct *work) bch_info(c, "retrying read"); ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ); rb->have_ioref = ca != NULL; + rb->start_time = local_clock(); bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = rb->pick.ptr.offset; bio->bi_iter.bi_size = btree_buf_bytes(b); @@ -1338,21 +1341,26 @@ static void btree_node_read_work(struct work_struct *work) } else { bio->bi_status = BLK_STS_REMOVED; } + + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, + rb->start_time, !bio->bi_status); start: printbuf_reset(&buf); bch2_btree_pos_to_text(&buf, c, b); - bch2_dev_io_err_on(ca && bio->bi_status, ca, BCH_MEMBER_ERROR_read, - "btree read error %s for %s", - bch2_blk_status_to_str(bio->bi_status), buf.buf); + 
+ if (ca && bio->bi_status) + bch_err_dev_ratelimited(ca, + "btree read error %s for %s", + bch2_blk_status_to_str(bio->bi_status), buf.buf); if (rb->have_ioref) percpu_ref_put(&ca->io_ref); rb->have_ioref = false; - bch2_mark_io_failure(&failed, &rb->pick); + bch2_mark_io_failure(&failed, &rb->pick, false); can_retry = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), - &failed, &rb->pick) > 0; + &failed, &rb->pick, -1) > 0; if (!bio->bi_status && !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) { @@ -1400,12 +1408,11 @@ static void btree_node_read_endio(struct bio *bio) struct btree_read_bio *rb = container_of(bio, struct btree_read_bio, bio); struct bch_fs *c = rb->c; + struct bch_dev *ca = rb->have_ioref + ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL; - if (rb->have_ioref) { - struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev); - - bch2_latency_acct(ca, rb->start_time, READ); - } + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, + rb->start_time, !bio->bi_status); queue_work(c->btree_read_complete_wq, &rb->work); } @@ -1697,7 +1704,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, return; ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), - NULL, &pick); + NULL, &pick, -1); if (ret <= 0) { struct printbuf buf = PRINTBUF; @@ -1811,6 +1818,190 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level)); } +struct btree_node_scrub { + struct bch_fs *c; + struct bch_dev *ca; + void *buf; + bool used_mempool; + unsigned written; + + enum btree_id btree; + unsigned level; + struct bkey_buf key; + __le64 seq; + + struct work_struct work; + struct bio bio; +}; + +static bool btree_node_scrub_check(struct bch_fs *c, struct btree_node *data, unsigned ptr_written, + struct printbuf *err) +{ + unsigned written = 0; + + if (le64_to_cpu(data->magic) != bset_magic(c)) { + prt_printf(err, "bad magic: want %llx, got %llx", + bset_magic(c), le64_to_cpu(data->magic)); + return false; + } + + while (written < (ptr_written ?: btree_sectors(c))) { + struct btree_node_entry *bne; + struct bset *i; + bool first = !written; + + if (first) { + bne = NULL; + i = &data->keys; + } else { + bne = (void *) data + (written << 9); + i = &bne->keys; + + if (!ptr_written && i->seq != data->keys.seq) + break; + } + + struct nonce nonce = btree_nonce(i, written << 9); + bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)); + + if (first) { + if (good_csum_type) { + struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, data); + if (bch2_crc_cmp(data->csum, csum)) { + bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), data->csum, csum); + return false; + } + } + + written += vstruct_sectors(data, c->block_bits); + } else { + if (good_csum_type) { + struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + if (bch2_crc_cmp(bne->csum, csum)) { + bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), bne->csum, csum); + return false; + } + } + + written += vstruct_sectors(bne, c->block_bits); + } + } + + return true; +} + +static void btree_node_scrub_work(struct work_struct *work) +{ + struct btree_node_scrub *scrub = container_of(work, struct btree_node_scrub, work); + struct bch_fs *c = scrub->c; + struct printbuf err = PRINTBUF; + + __bch2_btree_pos_to_text(&err, c, scrub->btree, scrub->level, + bkey_i_to_s_c(scrub->key.k)); + prt_newline(&err); + + if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) { + struct btree_trans *trans = bch2_trans_get(c); 
+ + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, scrub->btree, + scrub->key.k->k.p, 0, scrub->level - 1, 0); + + struct btree *b; + int ret = lockrestart_do(trans, PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(&iter))); + if (ret) + goto err; + + if (bkey_i_to_btree_ptr_v2(&b->key)->v.seq == scrub->seq) { + bch_err(c, "error validating btree node during scrub on %s at btree %s", + scrub->ca->name, err.buf); + + ret = bch2_btree_node_rewrite(trans, &iter, b, 0); + } +err: + bch2_trans_iter_exit(trans, &iter); + bch2_trans_begin(trans); + bch2_trans_put(trans); + } + + printbuf_exit(&err); + bch2_bkey_buf_exit(&scrub->key, c);; + btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf); + percpu_ref_put(&scrub->ca->io_ref); + kfree(scrub); + bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub); +} + +static void btree_node_scrub_endio(struct bio *bio) +{ + struct btree_node_scrub *scrub = container_of(bio, struct btree_node_scrub, bio); + + queue_work(scrub->c->btree_read_complete_wq, &scrub->work); +} + +int bch2_btree_node_scrub(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c k, unsigned dev) +{ + if (k.k->type != KEY_TYPE_btree_ptr_v2) + return 0; + + struct bch_fs *c = trans->c; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_node_scrub)) + return -BCH_ERR_erofs_no_writes; + + struct extent_ptr_decoded pick; + int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev); + if (ret <= 0) + goto err; + + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + if (!ca) { + ret = -BCH_ERR_device_offline; + goto err; + } + + bool used_mempool = false; + void *buf = btree_bounce_alloc(c, c->opts.btree_node_size, &used_mempool); + + unsigned vecs = buf_pages(buf, c->opts.btree_node_size); + + struct btree_node_scrub *scrub = + kzalloc(sizeof(*scrub) + sizeof(struct bio_vec) * vecs, GFP_KERNEL); + if (!scrub) { + ret = -ENOMEM; + goto err_free; + } + + scrub->c = c; + scrub->ca = ca; + scrub->buf = buf; + scrub->used_mempool = used_mempool; + scrub->written = btree_ptr_sectors_written(k); + + scrub->btree = btree; + scrub->level = level; + bch2_bkey_buf_init(&scrub->key); + bch2_bkey_buf_reassemble(&scrub->key, c, k); + scrub->seq = bkey_s_c_to_btree_ptr_v2(k).v->seq; + + INIT_WORK(&scrub->work, btree_node_scrub_work); + + bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->bio.bi_inline_vecs, vecs, REQ_OP_READ); + bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size); + scrub->bio.bi_iter.bi_sector = pick.ptr.offset; + scrub->bio.bi_end_io = btree_node_scrub_endio; + submit_bio(&scrub->bio); + return 0; +err_free: + btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf); + percpu_ref_put(&ca->io_ref); +err: + bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub); + return ret; +} + static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, struct btree_write *w) { @@ -1831,7 +2022,7 @@ static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, bch2_journal_pin_drop(&c->journal, &w->journal); } -static void __btree_node_write_done(struct bch_fs *c, struct btree *b) +static void __btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time) { struct btree_write *w = btree_prev_write(b); unsigned long old, new; @@ -1839,6 +2030,9 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) bch2_btree_complete_write(c, b, w); + if (start_time) + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_write], start_time); + old = 
READ_ONCE(b->flags); do { new = old; @@ -1869,7 +2063,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); } -static void btree_node_write_done(struct bch_fs *c, struct btree *b) +static void btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time) { struct btree_trans *trans = bch2_trans_get(c); @@ -1877,7 +2071,7 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b) /* we don't need transaction context anymore after we got the lock. */ bch2_trans_put(trans); - __btree_node_write_done(c, b); + __btree_node_write_done(c, b, start_time); six_unlock_read(&b->c.lock); } @@ -1887,6 +2081,7 @@ static void btree_node_write_work(struct work_struct *work) container_of(work, struct btree_write_bio, work); struct bch_fs *c = wbio->wbio.c; struct btree *b = wbio->wbio.bio.bi_private; + u64 start_time = wbio->start_time; int ret = 0; btree_bounce_free(c, @@ -1919,12 +2114,18 @@ static void btree_node_write_work(struct work_struct *work) } out: bio_put(&wbio->wbio.bio); - btree_node_write_done(c, b); + btree_node_write_done(c, b, start_time); return; err: set_btree_node_noevict(b); - bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c, - "writing btree node: %s", bch2_err_str(ret)); + + if (!bch2_err_matches(ret, EROFS)) { + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "writing btree node: %s\n ", bch2_err_str(ret)); + bch2_btree_pos_to_text(&buf, c, b); + bch2_fs_fatal_error(c, "%s", buf.buf); + printbuf_exit(&buf); + } goto out; } @@ -1937,16 +2138,21 @@ static void btree_node_write_endio(struct bio *bio) struct bch_fs *c = wbio->c; struct btree *b = wbio->bio.bi_private; struct bch_dev *ca = wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL; - unsigned long flags; - if (wbio->have_ioref) - bch2_latency_acct(ca, wbio->submit_time, WRITE); + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, + wbio->submit_time, !bio->bi_status); + + if (ca && bio->bi_status) { + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "btree write error: %s\n ", + bch2_blk_status_to_str(bio->bi_status)); + bch2_btree_pos_to_text(&buf, c, b); + bch_err_dev_ratelimited(ca, "%s", buf.buf); + printbuf_exit(&buf); + } - if (!ca || - bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, - "btree write error: %s", - bch2_blk_status_to_str(bio->bi_status)) || - bch2_meta_write_fault("btree")) { + if (bio->bi_status) { + unsigned long flags; spin_lock_irqsave(&c->btree_write_error_lock, flags); bch2_dev_list_add_dev(&orig->failed, wbio->dev); spin_unlock_irqrestore(&c->btree_write_error_lock, flags); @@ -2023,6 +2229,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) bool validate_before_checksum = false; enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK; void *data; + u64 start_time = local_clock(); int ret; if (flags & BTREE_WRITE_ALREADY_STARTED) @@ -2231,6 +2438,7 @@ do_write: wbio->data = data; wbio->data_bytes = bytes; wbio->sector_offset = b->written; + wbio->start_time = start_time; wbio->wbio.c = c; wbio->wbio.used_mempool = used_mempool; wbio->wbio.first_btree_write = !b->written; @@ -2258,7 +2466,7 @@ err: b->written += sectors_to_write; nowrite: btree_bounce_free(c, bytes, used_mempool, data); - __btree_node_write_done(c, b); + __btree_node_write_done(c, b, 0); } /* diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 6f9e4a6dacf7..dbf76d22c660 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -52,6 +52,7 @@ 
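A note on the timing plumbing above: __bch2_btree_node_write() samples local_clock() when the write is set up, the new start_time field of struct btree_write_bio (added just below) carries it through to completion, and __btree_node_write_done() only records a sample when start_time is nonzero - the nowrite path passes 0 precisely so that aborted writes don't pollute the statistics. The pattern in miniature, with names from the patch and the control flow simplified:

	u64 start_time = local_clock();		/* sampled at submission */
	/* ... bio completes, completion work runs ... */
	if (start_time)				/* 0 means "no sample", e.g. nothing written */
		bch2_time_stats_update(&c->times[BCH_TIME_btree_node_write], start_time);
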
struct btree_write_bio { void *data; unsigned data_bytes; unsigned sector_offset; + u64 start_time; struct bch_write_bio wbio; }; @@ -132,6 +133,9 @@ void bch2_btree_node_read(struct btree_trans *, struct btree *, bool); int bch2_btree_root_read(struct bch_fs *, enum btree_id, const struct bkey_i *, unsigned); +int bch2_btree_node_scrub(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, unsigned); + bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); enum btree_write_flags { diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index e32fce4fd258..a9c110b846b5 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -562,20 +562,6 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, bch2_btree_node_iter_peek_all(&l->iter, l->b)); } -static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans, - struct btree_path *path, - struct btree_path_level *l, - struct bkey *u) -{ - struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, - bch2_btree_node_iter_peek(&l->iter, l->b)); - - path->pos = k.k ? k.k->p : l->b->key.k.p; - trans->paths_sorted = false; - bch2_btree_path_verify_level(trans, path, l - path->l); - return k; -} - static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans, struct btree_path *path, struct btree_path_level *l, @@ -1501,22 +1487,14 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) for (struct jset_entry *e = trans->journal_entries; e != btree_trans_journal_entries_top(trans); - e = vstruct_next(e)) + e = vstruct_next(e)) { bch2_journal_entry_to_text(buf, trans->c, e); + prt_newline(buf); + } printbuf_indent_sub(buf, 2); } -noinline __cold -void bch2_dump_trans_updates(struct btree_trans *trans) -{ - struct printbuf buf = PRINTBUF; - - bch2_trans_updates_to_text(&buf, trans); - bch2_print_str(trans->c, buf.buf); - printbuf_exit(&buf); -} - static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx) { struct btree_path *path = trans->paths + path_idx; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index b96157f3dc9c..e6f51a3b8187 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -9,7 +9,6 @@ void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t); void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); -void bch2_dump_trans_updates(struct btree_trans *); void bch2_dump_trans_paths_updates(struct btree_trans *); static inline int __bkey_err(const struct bkey *k) @@ -335,13 +334,20 @@ static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_tra } __always_inline -static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip) +static int btree_trans_restart_foreign_task(struct btree_trans *trans, int err, unsigned long ip) { BUG_ON(err <= 0); BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart)); trans->restarted = err; trans->last_restarted_ip = ip; + return -err; +} + +__always_inline +static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip) +{ + btree_trans_restart_foreign_task(trans, err, ip); #ifdef CONFIG_BCACHEFS_DEBUG darray_exit(&trans->last_restarted_trace); bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT); diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 
6d25e3f85ce8..d1ad1a7613c9 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -644,6 +644,8 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, */ static int journal_sort_key_cmp(const void *_l, const void *_r) { + cond_resched(); + const struct journal_key *l = _l; const struct journal_key *r = _r; diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index caef65adeae4..94eb2b73a843 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -91,10 +91,10 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g) struct trans_waiting_for_lock *i; for (i = g->g; i != g->g + g->nr; i++) { - struct task_struct *task = i->trans->locking_wait.task; + struct task_struct *task = READ_ONCE(i->trans->locking_wait.task); if (i != g->g) prt_str(out, "<- "); - prt_printf(out, "%u ", task ?task->pid : 0); + prt_printf(out, "%u ", task ? task->pid : 0); } prt_newline(out); } @@ -172,7 +172,9 @@ static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) { if (i == g->g) { trace_would_deadlock(g, i->trans); - return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock); + return btree_trans_restart_foreign_task(i->trans, + BCH_ERR_transaction_restart_would_deadlock, + _THIS_IP_); } else { i->trans->lock_must_abort = true; wake_up_process(i->trans->locking_wait.task); diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index a7f06deee13c..25d54b77cdc2 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -13,6 +13,7 @@ #include <linux/kthread.h> #include <linux/min_heap.h> +#include <linux/sched/sysctl.h> #include <linux/sort.h> struct find_btree_nodes_worker { @@ -166,11 +167,17 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, bio->bi_iter.bi_sector = offset; bch2_bio_map(bio, bn, PAGE_SIZE); + u64 submit_time = local_clock(); submit_bio_wait(bio); - if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, - "IO error in try_read_btree_node() at %llu: %s", - offset, bch2_blk_status_to_str(bio->bi_status))) + + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status); + + if (bio->bi_status) { + bch_err_dev_ratelimited(ca, + "IO error in try_read_btree_node() at %llu: %s", + offset, bch2_blk_status_to_str(bio->bi_status)); return; + } if (le64_to_cpu(bn->magic) != bset_magic(c)) return; @@ -264,7 +271,7 @@ static int read_btree_nodes_worker(void *p) err: bio_put(bio); free_page((unsigned long) buf); - percpu_ref_get(&ca->io_ref); + percpu_ref_put(&ca->io_ref); closure_put(w->cl); kfree(w); return 0; @@ -283,32 +290,32 @@ static int read_btree_nodes(struct find_btree_nodes *f) continue; struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL); - struct task_struct *t; - if (!w) { percpu_ref_put(&ca->io_ref); ret = -ENOMEM; goto err; } - percpu_ref_get(&ca->io_ref); - closure_get(&cl); w->cl = &cl; w->f = f; w->ca = ca; - t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); + struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); ret = PTR_ERR_OR_ZERO(t); if (ret) { percpu_ref_put(&ca->io_ref); - closure_put(&cl); - f->ret = ret; - bch_err(c, "error starting kthread: %i", ret); + kfree(w); + bch_err_msg(c, ret, "starting kthread"); break; } + + closure_get(&cl); + percpu_ref_get(&ca->io_ref); + wake_up_process(t); } err: - closure_sync(&cl); + while 
(closure_sync_timeout(&cl, sysctl_hung_task_timeout_secs * HZ / 2)) + ; return f->ret ?: ret; } @@ -572,10 +579,12 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, found_btree_node_to_key(&tmp.k, &n); - struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k)); - bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); - printbuf_exit(&buf); + if (c->opts.verbose) { + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k)); + bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); + printbuf_exit(&buf); + } BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), (struct bkey_validate_context) { diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index c4f524b2ca9a..7d7e52ddde02 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -164,6 +164,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b)); EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos)); + kmsan_check_memory(insert, bkey_bytes(&insert->k)); k = bch2_btree_node_iter_peek_all(node_iter, b); if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) @@ -336,6 +337,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, BUG_ON(i->cached != path->cached); BUG_ON(i->level != path->level); BUG_ON(i->btree_id != path->btree_id); + BUG_ON(i->bkey_type != __btree_node_type(path->level, path->btree_id)); EBUG_ON(!i->level && btree_type_has_snapshots(i->btree_id) && !(i->flags & BTREE_UPDATE_internal_snapshot_node) && @@ -517,69 +519,45 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ } } -static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, - unsigned *btree_id_updates_start) +static int bch2_trans_commit_run_triggers(struct btree_trans *trans) { - bool trans_trigger_run; + unsigned sort_id_start = 0; - /* - * Running triggers will append more updates to the list of updates as - * we're walking it: - */ - do { - trans_trigger_run = false; + while (sort_id_start < trans->nr_updates) { + unsigned i, sort_id = trans->updates[sort_id_start].sort_order; + bool trans_trigger_run; - for (unsigned i = *btree_id_updates_start; - i < trans->nr_updates && trans->updates[i].btree_id <= btree_id; - i++) { - if (trans->updates[i].btree_id < btree_id) { - *btree_id_updates_start = i; - continue; + /* + * For a given btree, this algorithm runs insert triggers before + * overwrite triggers: this is so that when extents are being + * moved (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop + * references before they are re-added. 
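To make that ordering concrete (an illustrative sketch, not patch text): when an extent is moved, the same commit carries an insert at the new position and an overwrite at the old one, and the insert trigger must run first:

	/* illustrative only:
	 *   insert trigger:     take refs on the new extent's buckets
	 *   overwrite trigger:  drop refs on the old extent's buckets
	 * reversed, the references could briefly hit zero and the space
	 * be reclaimed out from under the data being moved
	 */
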
+ * + * Running triggers will append more updates to the list of + * updates as we're walking it: + */ + do { + trans_trigger_run = false; + + for (i = sort_id_start; + i < trans->nr_updates && trans->updates[i].sort_order <= sort_id; + i++) { + if (trans->updates[i].sort_order < sort_id) { + sort_id_start = i; + continue; + } + + int ret = run_one_trans_trigger(trans, trans->updates + i); + if (ret < 0) + return ret; + if (ret) + trans_trigger_run = true; } + } while (trans_trigger_run); - int ret = run_one_trans_trigger(trans, trans->updates + i); - if (ret < 0) - return ret; - if (ret) - trans_trigger_run = true; - } - } while (trans_trigger_run); - - trans_for_each_update(trans, i) - BUG_ON(!(i->flags & BTREE_TRIGGER_norun) && - i->btree_id == btree_id && - btree_node_type_has_trans_triggers(i->bkey_type) && - (!i->insert_trigger_run || !i->overwrite_trigger_run)); - - return 0; -} - -static int bch2_trans_commit_run_triggers(struct btree_trans *trans) -{ - unsigned btree_id = 0, btree_id_updates_start = 0; - int ret = 0; - - /* - * - * For a given btree, this algorithm runs insert triggers before - * overwrite triggers: this is so that when extents are being moved - * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before - * they are re-added. - */ - for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { - if (btree_id == BTREE_ID_alloc) - continue; - - ret = run_btree_triggers(trans, btree_id, &btree_id_updates_start); - if (ret) - return ret; + sort_id_start = i; } - btree_id_updates_start = 0; - ret = run_btree_triggers(trans, BTREE_ID_alloc, &btree_id_updates_start); - if (ret) - return ret; - #ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) BUG_ON(!(i->flags & BTREE_TRIGGER_norun) && @@ -903,18 +881,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, struct bch_fs *c = trans->c; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - switch (ret) { - case -BCH_ERR_btree_insert_btree_node_full: - ret = bch2_btree_split_leaf(trans, i->path, flags); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - trace_and_count(c, trans_restart_btree_node_split, trans, - trace_ip, trans->paths + i->path); - break; - case -BCH_ERR_btree_insert_need_mark_replicas: - ret = drop_locks_do(trans, - bch2_accounting_update_sb(trans)); - break; - case -BCH_ERR_journal_res_get_blocked: + if (bch2_err_matches(ret, BCH_ERR_journal_res_blocked)) { /* * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK * flag @@ -922,13 +889,26 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && watermark < BCH_WATERMARK_reclaim) { ret = -BCH_ERR_journal_reclaim_would_deadlock; - break; + goto out; } ret = drop_locks_do(trans, bch2_trans_journal_res_get(trans, (flags & BCH_WATERMARK_MASK)| JOURNAL_RES_GET_CHECK)); + goto out; + } + + switch (ret) { + case -BCH_ERR_btree_insert_btree_node_full: + ret = bch2_btree_split_leaf(trans, i->path, flags); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + trace_and_count(c, trans_restart_btree_node_split, trans, + trace_ip, trans->paths + i->path); + break; + case -BCH_ERR_btree_insert_need_mark_replicas: + ret = drop_locks_do(trans, + bch2_accounting_update_sb(trans)); break; case -BCH_ERR_btree_insert_need_journal_reclaim: bch2_trans_unlock(trans); @@ -950,7 +930,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, BUG_ON(ret >= 0); break; } - +out: BUG_ON(bch2_err_matches(ret, 
BCH_ERR_transaction_restart) != !!trans->restarted); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index a09cbe9cd94f..77578da2d23f 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -423,6 +423,7 @@ static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b) struct btree_insert_entry { unsigned flags; + u8 sort_order; u8 bkey_type; enum btree_id btree_id:8; u8 level:4; @@ -853,6 +854,18 @@ static inline bool btree_type_uses_write_buffer(enum btree_id btree) return BIT_ULL(btree) & mask; } +static inline u8 btree_trigger_order(enum btree_id btree) +{ + switch (btree) { + case BTREE_ID_alloc: + return U8_MAX; + case BTREE_ID_stripes: + return U8_MAX - 1; + default: + return btree; + } +} + struct btree_root { struct btree *b; diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 13d794f201a5..c05394f56424 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -17,7 +17,7 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, const struct btree_insert_entry *r) { - return cmp_int(l->btree_id, r->btree_id) ?: + return cmp_int(l->sort_order, r->sort_order) ?: cmp_int(l->cached, r->cached) ?: -cmp_int(l->level, r->level) ?: bpos_cmp(l->k->k.p, r->k->k.p); @@ -397,6 +397,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, n = (struct btree_insert_entry) { .flags = flags, + .sort_order = btree_trigger_order(path->btree_id), .bkey_type = __btree_node_type(path->level, path->btree_id), .btree_id = path->btree_id, .level = path->level, @@ -511,6 +512,8 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k, enum btree_iter_update_trigger_flags flags) { + kmsan_check_memory(k, bkey_bytes(&k->k)); + btree_path_idx_t path_idx = iter->update_path ?: iter->path; int ret; @@ -843,6 +846,19 @@ int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf) return 0; } +int bch2_trans_log_bkey(struct btree_trans *trans, enum btree_id btree, + unsigned level, struct bkey_i *k) +{ + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s)); + int ret = PTR_ERR_OR_ZERO(e); + if (ret) + return ret; + + journal_entry_init(e, BCH_JSET_ENTRY_log_bkey, btree, level, k->k.u64s); + bkey_copy(e->start, k); + return 0; +} + __printf(3, 0) static int __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 47d8690f01bf..568e56c91190 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -133,6 +133,8 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr enum btree_id btree, struct bkey_i *k) { + kmsan_check_memory(k, bkey_bytes(&k->k)); + if (unlikely(!btree_type_uses_write_buffer(btree))) { int ret = bch2_btree_write_buffer_insert_err(trans, btree, k); dump_stack(); @@ -168,6 +170,8 @@ void bch2_trans_commit_hook(struct btree_trans *, int __bch2_trans_commit(struct btree_trans *, unsigned); int bch2_trans_log_msg(struct btree_trans *, struct printbuf *); +int bch2_trans_log_bkey(struct btree_trans *, enum btree_id, unsigned, struct bkey_i *); + __printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...); __printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...); diff --git 
a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index e4e7c804625e..bf7e1dac7f46 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -35,6 +35,8 @@ static const char * const bch2_btree_update_modes[] = { NULL }; +static void bch2_btree_update_to_text(struct printbuf *, struct btree_update *); + static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, btree_path_idx_t, struct btree *, struct keylist *); static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); @@ -54,6 +56,8 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) struct bkey_buf prev; int ret = 0; + printbuf_indent_add_nextline(&buf, 2); + BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, b->data->min_key)); @@ -64,19 +68,20 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) if (b == btree_node_root(c, b)) { if (!bpos_eq(b->data->min_key, POS_MIN)) { - printbuf_reset(&buf); + ret = __bch2_topology_error(c, &buf); + bch2_bpos_to_text(&buf, b->data->min_key); log_fsck_err(trans, btree_root_bad_min_key, "btree root with incorrect min_key: %s", buf.buf); - goto topology_repair; + goto out; } if (!bpos_eq(b->data->max_key, SPOS_MAX)) { - printbuf_reset(&buf); + ret = __bch2_topology_error(c, &buf); bch2_bpos_to_text(&buf, b->data->max_key); log_fsck_err(trans, btree_root_bad_max_key, "btree root with incorrect max_key: %s", buf.buf); - goto topology_repair; + goto out; } } @@ -94,20 +99,19 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) : bpos_successor(prev.k->k.p); if (!bpos_eq(expected_min, bp.v->min_key)) { - bch2_topology_error(c); + ret = __bch2_topology_error(c, &buf); - printbuf_reset(&buf); - prt_str(&buf, "end of prev node doesn't match start of next node\n in "); + prt_str(&buf, "end of prev node doesn't match start of next node\nin "); bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); prt_str(&buf, " node "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - prt_str(&buf, "\n prev "); + prt_str(&buf, "\nprev "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); - prt_str(&buf, "\n next "); + prt_str(&buf, "\nnext "); bch2_bkey_val_to_text(&buf, c, k); log_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf); - goto topology_repair; + goto out; } bch2_bkey_buf_reassemble(&prev, c, k); @@ -115,29 +119,25 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) } if (bkey_deleted(&prev.k->k)) { - bch2_topology_error(c); + ret = __bch2_topology_error(c, &buf); - printbuf_reset(&buf); - prt_str(&buf, "empty interior node\n in "); + prt_str(&buf, "empty interior node\nin "); bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); prt_str(&buf, " node "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); log_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf); - goto topology_repair; } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) { - bch2_topology_error(c); + ret = __bch2_topology_error(c, &buf); - printbuf_reset(&buf); - prt_str(&buf, "last child node doesn't end at end of parent node\n in "); + prt_str(&buf, "last child node doesn't end at end of parent node\nin "); bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); prt_str(&buf, " node "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - prt_str(&buf, "\n last key "); + prt_str(&buf, "\nlast key "); 
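A note on the error-reporting rework in the hunks above: instead of resetting a printbuf and calling bch2_topology_error() at the end, each branch now seeds the buffer via __bch2_topology_error(c, &buf) and keeps appending, so the whole multi-line report is emitted in one call; printbuf_indent_add_nextline(&buf, 2) makes every continuation line land indented under the first. A sketch of the shape - helpers as used in the patch, the error id and message text invented:

	struct printbuf buf = PRINTBUF;
	printbuf_indent_add_nextline(&buf, 2);		/* indent lines after the first */
	ret = __bch2_topology_error(c, &buf);		/* seeds the report, sets ret */
	prt_str(&buf, "description of the inconsistency\n");
	bch2_bpos_to_text(&buf, pos);			/* detail lines, auto-indented */
	log_fsck_err(trans, some_err_id, "%s", buf.buf);
	printbuf_exit(&buf);
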
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); log_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf); - goto topology_repair; } out: fsck_err: @@ -145,9 +145,6 @@ fsck_err: bch2_bkey_buf_exit(&prev, c); printbuf_exit(&buf); return ret; -topology_repair: - ret = bch2_topology_error(c); - goto out; } /* Calculate ideal packed bkey format for new btree nodes: */ @@ -649,6 +646,14 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, return 0; } +/* If the node has been reused, we might be reading uninitialized memory - that's fine: */ +static noinline __no_kmsan_checks bool btree_node_seq_matches(struct btree *b, __le64 seq) +{ + struct btree_node *b_data = READ_ONCE(b->data); + + return (b_data ? b_data->keys.seq : 0) == seq; +} + static void btree_update_nodes_written(struct btree_update *as) { struct bch_fs *c = as->c; @@ -677,17 +682,9 @@ static void btree_update_nodes_written(struct btree_update *as) * on disk: */ for (i = 0; i < as->nr_old_nodes; i++) { - __le64 seq; - b = as->old_nodes[i]; - bch2_trans_begin(trans); - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - seq = b->data ? b->data->keys.seq : 0; - six_unlock_read(&b->c.lock); - bch2_trans_unlock_long(trans); - - if (seq == as->old_nodes_seq[i]) + if (btree_node_seq_matches(b, as->old_nodes_seq[i])) wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, TASK_UNINTERRUPTIBLE); } @@ -1271,7 +1268,8 @@ err: bch2_btree_update_free(as, trans); if (!bch2_err_matches(ret, ENOSPC) && !bch2_err_matches(ret, EROFS) && - ret != -BCH_ERR_journal_reclaim_would_deadlock) + ret != -BCH_ERR_journal_reclaim_would_deadlock && + ret != -BCH_ERR_journal_shutdown) bch_err_fn_ratelimited(c, ret); return ERR_PTR(ret); } @@ -1782,11 +1780,24 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t int ret; lockdep_assert_held(&c->gc_lock); - BUG_ON(!btree_node_intent_locked(path, b->c.level)); BUG_ON(!b->c.level); BUG_ON(!as || as->b); bch2_verify_keylist_sorted(keys); + if (!btree_node_intent_locked(path, b->c.level)) { + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "%s(): node not locked at level %u\n", + __func__, b->c.level); + bch2_btree_update_to_text(&buf, as); + bch2_btree_path_to_text(&buf, trans, path_idx); + + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); + bch2_fs_emergency_read_only(c); + return -EIO; + } + ret = bch2_btree_node_lock_write(trans, path, &b->c); if (ret) return ret; @@ -2007,18 +2018,22 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, } if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) { - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; - - bch2_bpos_to_text(&buf1, prev->data->max_key); - bch2_bpos_to_text(&buf2, next->data->min_key); - bch_err(c, - "%s(): btree topology error:\n" - " prev ends at %s\n" - " next starts at %s", - __func__, buf1.buf, buf2.buf); - printbuf_exit(&buf1); - printbuf_exit(&buf2); - ret = bch2_topology_error(c); + struct printbuf buf = PRINTBUF; + + printbuf_indent_add_nextline(&buf, 2); + prt_printf(&buf, "%s(): ", __func__); + ret = __bch2_topology_error(c, &buf); + prt_newline(&buf); + + prt_printf(&buf, "prev ends at "); + bch2_bpos_to_text(&buf, prev->data->max_key); + prt_newline(&buf); + + prt_printf(&buf, "next starts at "); + bch2_bpos_to_text(&buf, next->data->min_key); + + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); goto err; } @@ -2126,6 +2141,31 @@ err_free_update: goto out; } +static int 
get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter, + struct btree *b) +{ + bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p, + BTREE_MAX_DEPTH, b->c.level, + BTREE_ITER_intent); + int ret = bch2_btree_iter_traverse(iter); + if (ret) + goto err; + + /* has node been freed? */ + if (btree_iter_path(trans, iter)->l[b->c.level].b != b) { + /* node has been freed: */ + BUG_ON(!btree_node_dying(b)); + ret = -BCH_ERR_btree_node_dying; + goto err; + } + + BUG_ON(!btree_node_hashed(b)); + return 0; +err: + bch2_trans_iter_exit(trans, iter); + return ret; +} + int bch2_btree_node_rewrite(struct btree_trans *trans, struct btree_iter *iter, struct btree *b, @@ -2191,67 +2231,81 @@ err: goto out; } -struct async_btree_rewrite { - struct bch_fs *c; - struct work_struct work; - struct list_head list; - enum btree_id btree_id; - unsigned level; - struct bkey_buf key; -}; - -static int async_btree_node_rewrite_trans(struct btree_trans *trans, - struct async_btree_rewrite *a) +static int bch2_btree_node_rewrite_key(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_i *k, unsigned flags) { struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, - a->btree_id, a->key.k->k.p, - BTREE_MAX_DEPTH, a->level, 0); + btree, k->k.p, + BTREE_MAX_DEPTH, level, 0); struct btree *b = bch2_btree_iter_peek_node(&iter); int ret = PTR_ERR_OR_ZERO(b); if (ret) goto out; - bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(a->key.k); + bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k); ret = found - ? bch2_btree_node_rewrite(trans, &iter, b, 0) + ? bch2_btree_node_rewrite(trans, &iter, b, flags) : -ENOENT; +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} -#if 0 - /* Tracepoint... */ - if (!ret || ret == -ENOENT) { - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; +int bch2_btree_node_rewrite_pos(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bpos pos, unsigned flags) +{ + BUG_ON(!level); - if (!ret) { - prt_printf(&buf, "rewrite node:\n "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k)); - } else { - prt_printf(&buf, "node to rewrite not found:\n want: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k)); - prt_printf(&buf, "\n got: "); - if (b) - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - else - prt_str(&buf, "(null)"); - } - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - } -#endif -out: + /* Traverse one depth lower to get a pointer to the node itself: */ + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0); + struct btree *b = bch2_btree_iter_peek_node(&iter); + int ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto err; + + ret = bch2_btree_node_rewrite(trans, &iter, b, flags); +err: bch2_trans_iter_exit(trans, &iter); return ret; } +int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans, + struct btree *b, unsigned flags) +{ + struct btree_iter iter; + int ret = get_iter_to_node(trans, &iter, b); + if (ret) + return ret == -BCH_ERR_btree_node_dying ? 
0 : ret; + + ret = bch2_btree_node_rewrite(trans, &iter, b, flags); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +struct async_btree_rewrite { + struct bch_fs *c; + struct work_struct work; + struct list_head list; + enum btree_id btree_id; + unsigned level; + struct bkey_buf key; +}; + static void async_btree_node_rewrite_work(struct work_struct *work) { struct async_btree_rewrite *a = container_of(work, struct async_btree_rewrite, work); struct bch_fs *c = a->c; - int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a)); - if (ret != -ENOENT) + int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans, + a->btree_id, a->level, a->key.k, 0)); + if (ret != -ENOENT && + !bch2_err_matches(ret, EROFS) && + ret != -BCH_ERR_journal_shutdown) bch_err_fn_ratelimited(c, ret); spin_lock(&c->btree_node_rewrites_lock); @@ -2494,30 +2548,15 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, unsigned commit_flags, bool skip_triggers) { struct btree_iter iter; - int ret; - - bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p, - BTREE_MAX_DEPTH, b->c.level, - BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(&iter); + int ret = get_iter_to_node(trans, &iter, b); if (ret) - goto out; - - /* has node been freed? */ - if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) { - /* node has been freed: */ - BUG_ON(!btree_node_dying(b)); - goto out; - } - - BUG_ON(!btree_node_hashed(b)); + return ret == -BCH_ERR_btree_node_dying ? 0 : ret; bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr, !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev)); ret = bch2_btree_node_update_key(trans, &iter, b, new_key, commit_flags, skip_triggers); -out: bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 26d646e1275c..be71cd73b864 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -169,7 +169,14 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, struct btree *, unsigned); +int bch2_btree_node_rewrite_pos(struct btree_trans *, + enum btree_id, unsigned, + struct bpos, unsigned); +int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *, + struct btree *, unsigned); + void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); + int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, struct btree *, struct bkey_i *, unsigned, bool); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 345b117a4a4a..0903311cc71e 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -381,6 +381,36 @@ err: return ret; } +static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf, + struct bkey_s_c k, bool insert, enum bch_sb_error_id id) +{ + struct bch_fs *c = trans->c; + bool repeat = false, print = true, suppress = false; + + prt_printf(buf, "\nwhile marking "); + bch2_bkey_val_to_text(buf, c, k); + prt_newline(buf); + + __bch2_count_fsck_err(c, id, buf->buf, &repeat, &print, &suppress); + + int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + + if (insert) { + print = true; + suppress = false; + + bch2_trans_updates_to_text(buf, trans); + __bch2_inconsistent_error(c, buf); + ret = -BCH_ERR_bucket_ref_update; + } + + if (suppress) + prt_printf(buf, "Ratelimiting new instances of previous error\n"); + if (print) + bch2_print_string_as_lines(KERN_ERR, 
buf->buf); + return ret; +} + int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, struct bkey_s_c k, const struct bch_extent_ptr *ptr, @@ -396,32 +426,29 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, BUG_ON(!sectors); - if (gen_after(ptr->gen, b_gen)) { - bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); - log_fsck_err(trans, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" - "while marking %s", + if (unlikely(gen_after(ptr->gen, b_gen))) { + bch2_log_msg_start(c, &buf); + prt_printf(&buf, + "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen", ptr->dev, bucket_nr, b_gen, bch2_data_type_str(bucket_data_type ?: ptr_data_type), - ptr->gen, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - if (inserting) - goto err; + ptr->gen); + + ret = bucket_ref_update_err(trans, &buf, k, inserting, + BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen); goto out; } - if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { - bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); - log_fsck_err(trans, ptr_too_stale, - "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" - "while marking %s", + if (unlikely(gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX)) { + bch2_log_msg_start(c, &buf); + prt_printf(&buf, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale", ptr->dev, bucket_nr, b_gen, bch2_data_type_str(bucket_data_type ?: ptr_data_type), - ptr->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - if (inserting) - goto err; + ptr->gen); + + ret = bucket_ref_update_err(trans, &buf, k, inserting, + BCH_FSCK_ERR_ptr_too_stale); goto out; } @@ -430,62 +457,50 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, goto out; } - if (b_gen != ptr->gen) { - bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); - log_fsck_err(trans, stale_dirty_ptr, - "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" - "while marking %s", + if (unlikely(b_gen != ptr->gen)) { + bch2_log_msg_start(c, &buf); + prt_printf(&buf, + "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)", ptr->dev, bucket_nr, b_gen, bucket_gen_get(ca, bucket_nr), bch2_data_type_str(bucket_data_type ?: ptr_data_type), - ptr->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - if (inserting) - goto err; + ptr->gen); + + ret = bucket_ref_update_err(trans, &buf, k, inserting, + BCH_FSCK_ERR_stale_dirty_ptr); goto out; } - if (bucket_data_type_mismatch(bucket_data_type, ptr_data_type)) { - bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); - log_fsck_err(trans, ptr_bucket_data_type_mismatch, - "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" - "while marking %s", - ptr->dev, bucket_nr, b_gen, - bch2_data_type_str(bucket_data_type), - bch2_data_type_str(ptr_data_type), - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - if (inserting) - goto err; + if (unlikely(bucket_data_type_mismatch(bucket_data_type, ptr_data_type))) { + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s", + ptr->dev, bucket_nr, b_gen, + bch2_data_type_str(bucket_data_type), + bch2_data_type_str(ptr_data_type)); + + ret = bucket_ref_update_err(trans, &buf, k, inserting, + BCH_FSCK_ERR_ptr_bucket_data_type_mismatch); goto out; } - if ((u64) *bucket_sectors 
+ sectors > U32_MAX) { - bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); - log_fsck_err(trans, bucket_sector_count_overflow, - "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n" - "while marking %s", + if (unlikely((u64) *bucket_sectors + sectors > U32_MAX)) { + bch2_log_msg_start(c, &buf); + prt_printf(&buf, + "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX", ptr->dev, bucket_nr, b_gen, bch2_data_type_str(bucket_data_type ?: ptr_data_type), - *bucket_sectors, sectors, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - if (inserting) - goto err; + *bucket_sectors, sectors); + + ret = bucket_ref_update_err(trans, &buf, k, inserting, + BCH_FSCK_ERR_bucket_sector_count_overflow); sectors = -*bucket_sectors; + goto out; } *bucket_sectors += sectors; out: printbuf_exit(&buf); return ret; -err: -fsck_err: - bch2_dump_trans_updates(trans); - bch2_inconsistent_error(c); - ret = -BCH_ERR_bucket_ref_update; - goto out; } void bch2_trans_account_disk_usage_change(struct btree_trans *trans) @@ -590,11 +605,9 @@ static int bch2_trigger_pointer(struct btree_trans *trans, if (ret) goto err; - if (!p.ptr.cached) { - ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert); - if (ret) - goto err; - } + ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert); + if (ret) + goto err; } if (flags & BTREE_TRIGGER_gc) { @@ -653,9 +666,9 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, stripe_blockcount_get(&s->v, p.ec.block) + sectors); - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_replicas, - }; + struct disk_accounting_pos acc; + memset(&acc, 0, sizeof(acc)); + acc.type = BCH_DISK_ACCOUNTING_replicas; bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); acc.replicas.data_type = data_type; ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); @@ -674,26 +687,28 @@ err: return -BCH_ERR_ENOMEM_mark_stripe_ptr; } - mutex_lock(&c->ec_stripes_heap_lock); + gc_stripe_lock(m); if (!m || !m->alive) { - mutex_unlock(&c->ec_stripes_heap_lock); + gc_stripe_unlock(m); struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "pointer to nonexistent stripe %llu\n while marking ", + (u64) p.ec.idx); bch2_bkey_val_to_text(&buf, c, k); - bch_err_ratelimited(c, "pointer to nonexistent stripe %llu\n while marking %s", - (u64) p.ec.idx, buf.buf); + __bch2_inconsistent_error(c, &buf); + bch2_print_string_as_lines(KERN_ERR, buf.buf); printbuf_exit(&buf); - bch2_inconsistent_error(c); return -BCH_ERR_trigger_stripe_pointer; } m->block_sectors[p.ec.block] += sectors; - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_replicas, - }; + struct disk_accounting_pos acc; + memset(&acc, 0, sizeof(acc)); + acc.type = BCH_DISK_ACCOUNTING_replicas; memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e)); - mutex_unlock(&c->ec_stripes_heap_lock); + gc_stripe_unlock(m); acc.replicas.data_type = data_type; int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, true); @@ -719,16 +734,14 @@ static int __trigger_extent(struct btree_trans *trans, : BCH_DATA_user; int ret = 0; - struct disk_accounting_pos acc_replicas_key = { - .type = BCH_DISK_ACCOUNTING_replicas, - .replicas.data_type = data_type, - .replicas.nr_devs = 0, - .replicas.nr_required = 1, - }; + struct disk_accounting_pos acc_replicas_key; + memset(&acc_replicas_key, 0, sizeof(acc_replicas_key)); + acc_replicas_key.type = BCH_DISK_ACCOUNTING_replicas; +
acc_replicas_key.replicas.data_type = data_type; + acc_replicas_key.replicas.nr_devs = 0; + acc_replicas_key.replicas.nr_required = 1; - struct disk_accounting_pos acct_compression_key = { - .type = BCH_DISK_ACCOUNTING_compression, - }; + unsigned cur_compression_type = 0; u64 compression_acct[3] = { 1, 0, 0 }; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { @@ -762,13 +775,13 @@ static int __trigger_extent(struct btree_trans *trans, acc_replicas_key.replicas.nr_required = 0; } - if (acct_compression_key.compression.type && - acct_compression_key.compression.type != p.crc.compression_type) { + if (cur_compression_type && + cur_compression_type != p.crc.compression_type) { if (flags & BTREE_TRIGGER_overwrite) bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); - ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct, - ARRAY_SIZE(compression_acct), gc); + ret = bch2_disk_accounting_mod2(trans, gc, compression_acct, + compression, cur_compression_type); if (ret) return ret; @@ -777,7 +790,7 @@ static int __trigger_extent(struct btree_trans *trans, compression_acct[2] = 0; } - acct_compression_key.compression.type = p.crc.compression_type; + cur_compression_type = p.crc.compression_type; if (p.crc.compression_type) { compression_acct[1] += p.crc.uncompressed_size; compression_acct[2] += p.crc.compressed_size; @@ -791,45 +804,34 @@ static int __trigger_extent(struct btree_trans *trans, } if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) { - struct disk_accounting_pos acc_snapshot_key = { - .type = BCH_DISK_ACCOUNTING_snapshot, - .snapshot.id = k.k->p.snapshot, - }; - ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, snapshot, k.k->p.snapshot); if (ret) return ret; } - if (acct_compression_key.compression.type) { + if (cur_compression_type) { if (flags & BTREE_TRIGGER_overwrite) bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); - ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct, - ARRAY_SIZE(compression_acct), gc); + ret = bch2_disk_accounting_mod2(trans, gc, compression_acct, + compression, cur_compression_type); if (ret) return ret; } if (level) { - struct disk_accounting_pos acc_btree_key = { - .type = BCH_DISK_ACCOUNTING_btree, - .btree.id = btree_id, - }; - ret = bch2_disk_accounting_mod(trans, &acc_btree_key, replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, btree, btree_id); if (ret) return ret; } else { bool insert = !(flags & BTREE_TRIGGER_overwrite); - struct disk_accounting_pos acc_inum_key = { - .type = BCH_DISK_ACCOUNTING_inum, - .inum.inum = k.k->p.inode, - }; + s64 v[3] = { insert ? 1 : -1, insert ? 
k.k->size : -((s64) k.k->size), *replicas_sectors, }; - ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc); + ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode); if (ret) return ret; } @@ -878,15 +880,15 @@ int bch2_trigger_extent(struct btree_trans *trans, } int need_rebalance_delta = 0; - s64 need_rebalance_sectors_delta = 0; + s64 need_rebalance_sectors_delta[1] = { 0 }; s64 s = bch2_bkey_sectors_need_rebalance(c, old); need_rebalance_delta -= s != 0; - need_rebalance_sectors_delta -= s; + need_rebalance_sectors_delta[0] -= s; s = bch2_bkey_sectors_need_rebalance(c, new.s_c); need_rebalance_delta += s != 0; - need_rebalance_sectors_delta += s; + need_rebalance_sectors_delta[0] += s; if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) { int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, @@ -895,12 +897,9 @@ int bch2_trigger_extent(struct btree_trans *trans, return ret; } - if (need_rebalance_sectors_delta) { - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_rebalance_work, - }; - int ret = bch2_disk_accounting_mod(trans, &acc, &need_rebalance_sectors_delta, 1, - flags & BTREE_TRIGGER_gc); + if (need_rebalance_sectors_delta[0]) { + int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, + need_rebalance_sectors_delta, rebalance_work); if (ret) return ret; } @@ -916,17 +915,13 @@ static int __trigger_reservation(struct btree_trans *trans, enum btree_iter_update_trigger_flags flags) { if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { - s64 sectors = k.k->size; + s64 sectors[1] = { k.k->size }; if (flags & BTREE_TRIGGER_overwrite) - sectors = -sectors; - - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_persistent_reserved, - .persistent_reserved.nr_replicas = bkey_s_c_to_reservation(k).v->nr_replicas, - }; + sectors[0] = -sectors[0]; - return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, flags & BTREE_TRIGGER_gc); + return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, sectors, + persistent_reserved, bkey_s_c_to_reservation(k).v->nr_replicas); } return 0; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index a9acdd6c0c86..c5363256e363 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -39,33 +39,6 @@ static inline u64 sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t for (_b = (_buckets)->b + (_buckets)->first_bucket; \ _b < (_buckets)->b + (_buckets)->nbuckets; _b++) -/* - * Ugly hack alert: - * - * We need to cram a spinlock in a single byte, because that's what we have left - * in struct bucket, and we care about the size of these - during fsck, we need - * in memory state for every single bucket on every device. - * - * We used to do - * while (xchg(&b->lock, 1) cpu_relax(); - * but, it turns out not all architectures support xchg on a single byte. - * - * So now we use bit_spin_lock(), with fun games since we can't burn a whole - * ulong for this - we just need to make sure the lock bit always ends up in the - * first byte. - */ - -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -#define BUCKET_LOCK_BITNR 0 -#else -#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1) -#endif - -union ulong_byte_assert { - ulong ulong; - u8 byte; -}; - static inline void bucket_unlock(struct bucket *b) { BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte); @@ -167,9 +140,7 @@ static inline int gen_cmp(u8 a, u8 b) static inline int gen_after(u8 a, u8 b) { - int r = gen_cmp(a, b); - - return r > 0 ?
r : 0; + return max(0, gen_cmp(a, b)); } static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr) diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 7174047b8e92..900b8680c8b5 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -7,6 +7,33 @@ #define BUCKET_JOURNAL_SEQ_BITS 16 +/* + * Ugly hack alert: + * + * We need to cram a spinlock in a single byte, because that's what we have left + * in struct bucket, and we care about the size of these - during fsck, we need + * in memory state for every single bucket on every device. + * + * We used to do + * while (xchg(&b->lock, 1) cpu_relax(); + * but, it turns out not all architectures support xchg on a single byte. + * + * So now we use bit_spin_lock(), with fun games since we can't burn a whole + * ulong for this - we just need to make sure the lock bit always ends up in the + * first byte. + */ + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define BUCKET_LOCK_BITNR 0 +#else +#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1) +#endif + +union ulong_byte_assert { + ulong ulong; + u8 byte; +}; + struct bucket { u8 lock; u8 gen_valid:1; diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 46e9e32105a9..584f4a3eb670 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -11,6 +11,7 @@ #include "move.h" #include "recovery_passes.h" #include "replicas.h" +#include "sb-counters.h" #include "super-io.h" #include "thread_with_file.h" @@ -312,7 +313,12 @@ static int bch2_data_thread(void *arg) struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr); ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); - ctx->stats.data_type = U8_MAX; + if (ctx->thr.ret == -BCH_ERR_device_offline) + ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_device_offline; + else { + ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done; + ctx->stats.data_type = (int) DATA_PROGRESS_DATA_TYPE_done; + } return 0; } @@ -331,14 +337,30 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf, struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr); struct bch_fs *c = ctx->c; struct bch_ioctl_data_event e = { - .type = BCH_DATA_EVENT_PROGRESS, - .p.data_type = ctx->stats.data_type, - .p.btree_id = ctx->stats.pos.btree, - .p.pos = ctx->stats.pos.pos, - .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), - .p.sectors_total = bch2_fs_usage_read_short(c).used, + .type = BCH_DATA_EVENT_PROGRESS, + .ret = ctx->stats.ret, + .p.data_type = ctx->stats.data_type, + .p.btree_id = ctx->stats.pos.btree, + .p.pos = ctx->stats.pos.pos, + .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), + .p.sectors_error_corrected = atomic64_read(&ctx->stats.sectors_error_corrected), + .p.sectors_error_uncorrected = atomic64_read(&ctx->stats.sectors_error_uncorrected), }; + if (ctx->arg.op == BCH_DATA_OP_scrub) { + struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev); + if (ca) { + struct bch_dev_usage u; + bch2_dev_usage_read_fast(ca, &u); + for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++) + if (ctx->arg.scrub.data_types & BIT(i)) + e.p.sectors_total += u.d[i].sectors; + bch2_dev_put(ca); + } + } else { + e.p.sectors_total = bch2_fs_usage_read_short(c).used; + } + if (len < sizeof(e)) return -EINVAL; @@ -404,10 +426,8 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, arg.replica_entries_bytes = replicas.nr; for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) { - struct disk_accounting_pos k = { - .type = 
BCH_DISK_ACCOUNTING_persistent_reserved, - .persistent_reserved.nr_replicas = i, - }; + struct disk_accounting_pos k; + disk_accounting_key_init(k, persistent_reserved, .nr_replicas = i); bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&k), @@ -710,6 +730,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online); case BCH_IOCTL_QUERY_ACCOUNTING: return bch2_ioctl_query_accounting(c, arg); + case BCH_IOCTL_QUERY_COUNTERS: + return bch2_ioctl_query_counters(c, arg); default: return -ENOTTY; } diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 23a383577d4c..3726689093e3 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -466,7 +466,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, prt_str(&buf, ")"); WARN_RATELIMIT(1, "%s", buf.buf); printbuf_exit(&buf); - return -EIO; + return -BCH_ERR_recompute_checksum; } for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { @@ -693,6 +693,14 @@ static int bch2_alloc_ciphers(struct bch_fs *c) return 0; } +#if 0 + +/* + * This seems to be duplicating code in cmd_remove_passphrase() in + * bcachefs-tools, but we might want to switch userspace to use this - and + * perhaps add an ioctl for calling this at runtime, so we can take the + * passphrase off of a mounted filesystem (which has come up). + */ int bch2_disable_encryption(struct bch_fs *c) { struct bch_sb_field_crypt *crypt; @@ -725,6 +733,10 @@ out: return ret; } +/* + * For enabling encryption on an existing filesystem: not hooked up yet, but it + * should be + */ int bch2_enable_encryption(struct bch_fs *c, bool keyed) { struct bch_encrypted_key key; @@ -781,6 +793,7 @@ err: memzero_explicit(&key, sizeof(key)); return ret; } +#endif void bch2_fs_encryption_exit(struct bch_fs *c) { @@ -788,8 +801,6 @@ void bch2_fs_encryption_exit(struct bch_fs *c) crypto_free_shash(c->poly1305); if (c->chacha20) crypto_free_sync_skcipher(c->chacha20); - if (c->sha256) - crypto_free_shash(c->sha256); } int bch2_fs_encryption_init(struct bch_fs *c) @@ -798,14 +809,6 @@ int bch2_fs_encryption_init(struct bch_fs *c) struct bch_key key; int ret = 0; - c->sha256 = crypto_alloc_shash("sha256", 0, 0); - ret = PTR_ERR_OR_ZERO(c->sha256); - if (ret) { - c->sha256 = NULL; - bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret)); - goto out; - } - crypt = bch2_sb_field_get(c->disk_sb.sb, crypt); if (!crypt) goto out; diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 43b9d71f2f2b..4ac251c8fcd8 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -103,8 +103,10 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_crypt; int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, struct bch_key *); +#if 0 int bch2_disable_encryption(struct bch_fs *); int bch2_enable_encryption(struct bch_fs *, bool); +#endif void bch2_fs_encryption_exit(struct bch_fs *); int bch2_fs_encryption_init(struct bch_fs *); diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 114bf2f3879f..85fc90342492 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -177,7 +177,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, size_t src_len = src->bi_iter.bi_size; size_t dst_len = crc.uncompressed_size << 9; void *workspace; - int ret; + int ret = 0, ret2; enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type); mempool_t *workspace_pool = &c->compress_workspace[opt]; @@ -189,7 +189,7 @@ static int 
__bio_uncompress(struct bch_fs *c, struct bio *src, else ret = -BCH_ERR_compression_workspace_not_initialized; if (ret) - goto out; + goto err; } src_data = bio_map_or_bounce(c, src, READ); @@ -197,10 +197,10 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, switch (crc.compression_type) { case BCH_COMPRESSION_TYPE_lz4_old: case BCH_COMPRESSION_TYPE_lz4: - ret = LZ4_decompress_safe_partial(src_data.b, dst_data, - src_len, dst_len, dst_len); - if (ret != dst_len) - goto err; + ret2 = LZ4_decompress_safe_partial(src_data.b, dst_data, + src_len, dst_len, dst_len); + if (ret2 != dst_len) + ret = -BCH_ERR_decompress_lz4; break; case BCH_COMPRESSION_TYPE_gzip: { z_stream strm = { @@ -214,45 +214,43 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, zlib_set_workspace(&strm, workspace); zlib_inflateInit2(&strm, -MAX_WBITS); - ret = zlib_inflate(&strm, Z_FINISH); + ret2 = zlib_inflate(&strm, Z_FINISH); mempool_free(workspace, workspace_pool); - if (ret != Z_STREAM_END) - goto err; + if (ret2 != Z_STREAM_END) + ret = -BCH_ERR_decompress_gzip; break; } case BCH_COMPRESSION_TYPE_zstd: { ZSTD_DCtx *ctx; size_t real_src_len = le32_to_cpup(src_data.b); - if (real_src_len > src_len - 4) + if (real_src_len > src_len - 4) { + ret = -BCH_ERR_decompress_zstd_src_len_bad; goto err; + } workspace = mempool_alloc(workspace_pool, GFP_NOFS); ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); - ret = zstd_decompress_dctx(ctx, + ret2 = zstd_decompress_dctx(ctx, dst_data, dst_len, src_data.b + 4, real_src_len); mempool_free(workspace, workspace_pool); - if (ret != dst_len) - goto err; + if (ret2 != dst_len) + ret = -BCH_ERR_decompress_zstd; break; } default: BUG(); } - ret = 0; +err: fsck_err: -out: bio_unmap_or_unbounce(c, src_data); return ret; -err: - ret = -EIO; - goto out; } int bch2_bio_uncompress_inplace(struct bch_write_op *op, @@ -268,27 +266,22 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op, BUG_ON(!bio->bi_vcnt); BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); - if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || - crc->compressed_size << 9 > c->opts.encoded_extent_max) { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "error rewriting existing data: extent too big"); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - return -EIO; + if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max) { + bch2_write_op_error(op, op->pos.offset, + "extent too big to decompress (%u > %u)", + crc->uncompressed_size << 9, c->opts.encoded_extent_max); + return -BCH_ERR_decompress_exceeded_max_encoded_extent; } data = __bounce_alloc(c, dst_len, WRITE); - if (__bio_uncompress(c, bio, data.b, *crc)) { - if (!c->opts.no_data_io) { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "error rewriting existing data: decompression error"); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - } - ret = -EIO; + ret = __bio_uncompress(c, bio, data.b, *crc); + + if (c->opts.no_data_io) + ret = 0; + + if (ret) { + bch2_write_op_error(op, op->pos.offset, "%s", bch2_err_str(ret)); goto err; } @@ -321,7 +314,7 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || crc.compressed_size << 9 > c->opts.encoded_extent_max) - return -EIO; + return -BCH_ERR_decompress_exceeded_max_encoded_extent; dst_data = dst_len == dst_iter.bi_size ? 
__bio_map_or_bounce(c, dst, dst_iter, WRITE) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 642fbc60ecab..fe400dfc5d76 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -20,6 +20,15 @@ #include "subvolume.h" #include "trace.h" +#include <linux/ioprio.h> + +static const char * const bch2_data_update_type_strs[] = { +#define x(t, n, ...) [n] = #t, + BCH_DATA_UPDATE_TYPES() +#undef x + NULL +}; + static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -33,7 +42,7 @@ static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k) struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr) { - if (!bch2_dev_tryget(c, ptr->dev)) { + if (unlikely(!bch2_dev_tryget(c, ptr->dev))) { bkey_for_each_ptr(ptrs, ptr2) { if (ptr2 == ptr) break; @@ -91,7 +100,7 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc return true; } -static noinline void trace_move_extent_finish2(struct data_update *u, +static noinline void trace_io_move_finish2(struct data_update *u, struct bkey_i *new, struct bkey_i *insert) { @@ -111,11 +120,11 @@ static noinline void trace_move_extent_finish2(struct data_update *u, bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); prt_newline(&buf); - trace_move_extent_finish(c, buf.buf); + trace_io_move_finish(c, buf.buf); printbuf_exit(&buf); } -static void trace_move_extent_fail2(struct data_update *m, +static void trace_io_move_fail2(struct data_update *m, struct bkey_s_c new, struct bkey_s_c wrote, struct bkey_i *insert, @@ -126,7 +135,7 @@ static void trace_move_extent_fail2(struct data_update *m, struct printbuf buf = PRINTBUF; unsigned rewrites_found = 0; - if (!trace_move_extent_fail_enabled()) + if (!trace_io_move_fail_enabled()) return; prt_str(&buf, msg); @@ -166,7 +175,7 @@ static void trace_move_extent_fail2(struct data_update *m, bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); } - trace_move_extent_fail(c, buf.buf); + trace_io_move_fail(c, buf.buf); printbuf_exit(&buf); } @@ -179,6 +188,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, container_of(op, struct data_update, op); struct keylist *keys = &op->insert_keys; struct bkey_buf _new, _insert; + struct printbuf journal_msg = PRINTBUF; int ret = 0; bch2_bkey_buf_init(&_new); @@ -214,7 +224,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, new = bkey_i_to_extent(bch2_keylist_front(keys)); if (!bch2_extents_match(k, old)) { - trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), + trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), NULL, "no match:"); goto nowork; } @@ -254,7 +264,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, if (m->data_opts.rewrite_ptrs && !rewrites_found && bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) { - trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:"); + trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:"); goto nowork; } @@ -271,7 +281,7 @@ restart_drop_conflicting_replicas: } if (!bkey_val_u64s(&new->k)) { - trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:"); + trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:"); goto nowork; } @@ -352,7 +362,7 @@ restart_drop_extra_replicas: printbuf_exit(&buf); bch2_fatal_error(c); - ret = -EIO; + ret = -BCH_ERR_invalid_bkey; goto out; } @@ -370,7 +380,12 @@ 
restart_drop_extra_replicas: printbuf_exit(&buf); } - ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id, + printbuf_reset(&journal_msg); + prt_str(&journal_msg, bch2_data_update_type_strs[m->type]); + + ret = bch2_trans_log_msg(trans, &journal_msg) ?: + bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?: + bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, bkey_start_pos(&insert->k)) ?: bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, insert->k.p) ?: @@ -385,9 +400,9 @@ restart_drop_extra_replicas: if (!ret) { bch2_btree_iter_set_pos(&iter, next_pos); - this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); - if (trace_move_extent_finish_enabled()) - trace_move_extent_finish2(m, &new->k_i, insert); + this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size); + if (trace_io_move_finish_enabled()) + trace_io_move_finish2(m, &new->k_i, insert); } err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -409,12 +424,13 @@ nowork: &m->stats->sectors_raced); } - count_event(c, move_extent_fail); + count_event(c, io_move_fail); bch2_btree_iter_advance(&iter); goto next; } out: + printbuf_exit(&journal_msg); bch2_trans_iter_exit(trans, &iter); bch2_bkey_buf_exit(&_insert, c); bch2_bkey_buf_exit(&_new, c); @@ -427,14 +443,17 @@ int bch2_data_update_index_update(struct bch_write_op *op) return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op)); } -void bch2_data_update_read_done(struct data_update *m, - struct bch_extent_crc_unpacked crc) +void bch2_data_update_read_done(struct data_update *m) { + m->read_done = true; + /* write bio must own pages: */ BUG_ON(!m->op.wbio.bio.bi_vcnt); - m->op.crc = crc; - m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9; + m->op.crc = m->rbio.pick.crc; + m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; + + this_cpu_add(m->op.c->counters[BCH_COUNTER_io_move_write], m->k.k->k.size); closure_call(&m->op.cl, bch2_write, NULL, NULL); } @@ -444,31 +463,34 @@ void bch2_data_update_exit(struct data_update *update) struct bch_fs *c = update->op.c; struct bkey_s_c k = bkey_i_to_s_c(update->k.k); + bch2_bio_free_pages_pool(c, &update->op.wbio.bio); + kfree(update->bvecs); + update->bvecs = NULL; + if (c->opts.nocow_enabled) bkey_nocow_unlock(c, k); bkey_put_dev_refs(c, k); - bch2_bkey_buf_exit(&update->k, c); bch2_disk_reservation_put(c, &update->op.res); - bch2_bio_free_pages_pool(c, &update->op.wbio.bio); + bch2_bkey_buf_exit(&update->k, c); } -static void bch2_update_unwritten_extent(struct btree_trans *trans, - struct data_update *update) +static int bch2_update_unwritten_extent(struct btree_trans *trans, + struct data_update *update) { struct bch_fs *c = update->op.c; - struct bio *bio = &update->op.wbio.bio; struct bkey_i_extent *e; struct write_point *wp; struct closure cl; struct btree_iter iter; struct bkey_s_c k; - int ret; + int ret = 0; closure_init_stack(&cl); bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys); - while (bio_sectors(bio)) { - unsigned sectors = bio_sectors(bio); + while (bpos_lt(update->op.pos, update->k.k->k.p)) { + unsigned sectors = update->k.k->k.p.offset - + update->op.pos.offset; bch2_trans_begin(trans); @@ -504,7 +526,7 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, bch_err_fn_ratelimited(c, ret); if (ret) - return; + break; sectors = min(sectors, wp->sectors_free); @@ -514,7 +536,6 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, 
sectors, false); bch2_alloc_sectors_done(c, wp); - bio_advance(bio, sectors << 9); update->op.pos.offset += sectors; extent_for_each_ptr(extent_i_to_s(e), ptr) @@ -533,13 +554,16 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, bch2_trans_unlock(trans); closure_sync(&cl); } + + return ret; } void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { - printbuf_tabstop_push(out, 20); + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 20); prt_str_indented(out, "rewrite ptrs:\t"); bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); @@ -567,6 +591,9 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) { + prt_str(out, bch2_data_update_type_strs[m->type]); + prt_newline(out); + bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); prt_newline(out); @@ -574,6 +601,17 @@ void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); } +void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update *m) +{ + bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); + prt_newline(out); + printbuf_indent_add(out, 2); + bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); + prt_printf(out, "read_done:\t%u\n", m->read_done); + bch2_write_op_to_text(out, &m->op); + printbuf_indent_sub(out, 2); +} + int bch2_extent_drop_ptrs(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -617,12 +655,85 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } +int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, + struct bch_io_opts *io_opts) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + /* write path might have to decompress data: */ + unsigned buf_bytes = 0; + bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry) + buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); + + unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE); + + m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL); + if (!m->bvecs) + return -ENOMEM; + + bio_init(&m->rbio.bio, NULL, m->bvecs, nr_vecs, REQ_OP_READ); + bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0); + + if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) { + kfree(m->bvecs); + m->bvecs = NULL; + return -ENOMEM; + } + + rbio_init(&m->rbio.bio, c, *io_opts, NULL); + m->rbio.data_update = true; + m->rbio.bio.bi_iter.bi_size = buf_bytes; + m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k); + m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); + return 0; +} + +static int can_write_extent(struct bch_fs *c, struct data_update *m) +{ + if ((m->op.flags & BCH_WRITE_alloc_nowait) && + unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark))) + return -BCH_ERR_data_update_done_would_block; + + unsigned target = m->op.flags & BCH_WRITE_only_specified_devs ?
m->op.target + : 0; + struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target); + + darray_for_each(m->op.devs_have, i) + __clear_bit(*i, devs.d); + + rcu_read_lock(); + unsigned nr_replicas = 0, i; + for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) { + struct bch_dev *ca = bch2_dev_rcu(c, i); + + struct bch_dev_usage usage; + bch2_dev_usage_read_fast(ca, &usage); + + if (!dev_buckets_free(ca, usage, m->op.watermark)) + continue; + + nr_replicas += ca->mi.durability; + if (nr_replicas >= m->op.nr_replicas) + break; + } + rcu_read_unlock(); + + if (!nr_replicas) + return -BCH_ERR_data_update_done_no_rw_devs; + if (nr_replicas < m->op.nr_replicas) + return -BCH_ERR_insufficient_devices; + return 0; +} + int bch2_data_update_init(struct btree_trans *trans, struct btree_iter *iter, struct moving_context *ctxt, struct data_update *m, struct write_point_specifier wp, - struct bch_io_opts io_opts, + struct bch_io_opts *io_opts, struct data_update_opts data_opts, enum btree_id btree_id, struct bkey_s_c k) @@ -640,36 +751,30 @@ int bch2_data_update_init(struct btree_trans *trans, * snapshots table - just skip it, we can move it later. */ if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot))) - return -BCH_ERR_data_update_done; - - if (!bkey_get_dev_refs(c, k)) - return -BCH_ERR_data_update_done; - - if (c->opts.nocow_enabled && - !bkey_nocow_lock(c, ctxt, k)) { - bkey_put_dev_refs(c, k); - return -BCH_ERR_nocow_lock_blocked; - } + return -BCH_ERR_data_update_done_no_snapshot; bch2_bkey_buf_init(&m->k); bch2_bkey_buf_reassemble(&m->k, c, k); + m->type = data_opts.btree_insert_flags & BCH_WATERMARK_copygc + ? BCH_DATA_UPDATE_copygc + : BCH_DATA_UPDATE_rebalance; m->btree_id = btree_id; m->data_opts = data_opts; m->ctxt = ctxt; m->stats = ctxt ? 
ctxt->stats : NULL; - bch2_write_op_init(&m->op, c, io_opts); + bch2_write_op_init(&m->op, c, *io_opts); m->op.pos = bkey_start_pos(k.k); m->op.version = k.k->bversion; m->op.target = data_opts.target; m->op.write_point = wp; m->op.nr_replicas = 0; - m->op.flags |= BCH_WRITE_PAGES_STABLE| - BCH_WRITE_PAGES_OWNED| - BCH_WRITE_DATA_ENCODED| - BCH_WRITE_MOVE| + m->op.flags |= BCH_WRITE_pages_stable| + BCH_WRITE_pages_owned| + BCH_WRITE_data_encoded| + BCH_WRITE_move| m->data_opts.write_flags; - m->op.compression_opt = io_opts.background_compression; + m->op.compression_opt = io_opts->background_compression; m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; unsigned durability_have = 0, durability_removing = 0; @@ -707,7 +812,7 @@ int bch2_data_update_init(struct btree_trans *trans, ptr_bit <<= 1; } - unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have)); + unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have)); /* * If current extent durability is less than io_opts.data_replicas, @@ -740,28 +845,70 @@ int bch2_data_update_init(struct btree_trans *trans, m->data_opts.rewrite_ptrs = 0; /* if iter == NULL, it's just a promote */ if (iter) - ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts); - goto out; + ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts); + if (!ret) + ret = -BCH_ERR_data_update_done_no_writes_needed; + goto out_bkey_buf_exit; } + /* + * Check if the allocation will succeed, to avoid getting an error later + * in bch2_write() -> bch2_alloc_sectors_start() and doing a useless + * read: + * + * This guards against + * - BCH_WRITE_alloc_nowait allocations failing (promotes) + * - Destination target full + * - Device(s) in destination target offline + * - Insufficient durability available in destination target + * (i.e. trying to move a durability=2 replica to a target with a + * single durability=2 device) + */ + ret = can_write_extent(c, m); + if (ret) + goto out_bkey_buf_exit; + if (reserve_sectors) { ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, m->data_opts.extra_replicas ? 
0 : BCH_DISK_RESERVATION_NOFAIL); if (ret) - goto out; + goto out_bkey_buf_exit; + } + + if (!bkey_get_dev_refs(c, k)) { + ret = -BCH_ERR_data_update_done_no_dev_refs; + goto out_put_disk_res; + } + + if (c->opts.nocow_enabled && + !bkey_nocow_lock(c, ctxt, k)) { + ret = -BCH_ERR_nocow_lock_blocked; + goto out_put_dev_refs; } if (bkey_extent_is_unwritten(k)) { - bch2_update_unwritten_extent(trans, m); - goto out; + ret = bch2_update_unwritten_extent(trans, m) ?: + -BCH_ERR_data_update_done_unwritten; + goto out_nocow_unlock; } + ret = bch2_data_update_bios_init(m, c, io_opts); + if (ret) + goto out_nocow_unlock; + return 0; -out: - bch2_data_update_exit(m); - return ret ?: -BCH_ERR_data_update_done; +out_nocow_unlock: + if (c->opts.nocow_enabled) + bkey_nocow_unlock(c, k); +out_put_dev_refs: + bkey_put_dev_refs(c, k); +out_put_disk_res: + bch2_disk_reservation_put(c, &m->op.res); +out_bkey_buf_exit: + bch2_bkey_buf_exit(&m->k, c); + return ret; } void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index e4b50723428e..ed05125867da 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -4,6 +4,7 @@ #define _BCACHEFS_DATA_UPDATE_H #include "bkey_buf.h" +#include "io_read.h" #include "io_write_types.h" struct moving_context; @@ -15,27 +16,46 @@ struct data_update_opts { u8 extra_replicas; unsigned btree_insert_flags; unsigned write_flags; + + int read_dev; + bool scrub; }; void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *, struct bch_io_opts *, struct data_update_opts *); +#define BCH_DATA_UPDATE_TYPES() \ + x(copygc, 0) \ + x(rebalance, 1) \ + x(promote, 2) + +enum bch_data_update_types { +#define x(n, id) BCH_DATA_UPDATE_##n = id, + BCH_DATA_UPDATE_TYPES() +#undef x +}; + struct data_update { + enum bch_data_update_types type; /* extent being updated: */ + bool read_done; enum btree_id btree_id; struct bkey_buf k; struct data_update_opts data_opts; struct moving_context *ctxt; struct bch_move_stats *stats; + + struct bch_read_bio rbio; struct bch_write_op op; + struct bio_vec *bvecs; }; void bch2_data_update_to_text(struct printbuf *, struct data_update *); +void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *); int bch2_data_update_index_update(struct bch_write_op *); -void bch2_data_update_read_done(struct data_update *, - struct bch_extent_crc_unpacked); +void bch2_data_update_read_done(struct data_update *); int bch2_extent_drop_ptrs(struct btree_trans *, struct btree_iter *, @@ -43,12 +63,15 @@ int bch2_extent_drop_ptrs(struct btree_trans *, struct bch_io_opts *, struct data_update_opts *); +int bch2_data_update_bios_init(struct data_update *, struct bch_fs *, + struct bch_io_opts *); + void bch2_data_update_exit(struct data_update *); int bch2_data_update_init(struct btree_trans *, struct btree_iter *, struct moving_context *, struct data_update *, struct write_point_specifier, - struct bch_io_opts, struct data_update_opts, + struct bch_io_opts *, struct data_update_opts, enum btree_id, struct bkey_s_c); void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *); diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 55333e82d1fe..788af88f6979 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -7,6 +7,7 @@ */ #include "bcachefs.h" +#include "alloc_foreground.h" #include "bkey_methods.h" #include "btree_cache.h" #include "btree_io.h" @@ -190,7 +191,7 @@ void 
bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, unsigned offset = 0; int ret; - if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) { + if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick, -1) <= 0) { prt_printf(out, "error getting device to read from: invalid device\n"); return; } @@ -844,8 +845,11 @@ restart: seqmutex_unlock(&c->btree_trans_lock); } -static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) +typedef void (*fs_to_text_fn)(struct printbuf *, struct bch_fs *); + +static ssize_t bch2_simple_print(struct file *file, char __user *buf, + size_t size, loff_t *ppos, + fs_to_text_fn fn) { struct dump_iter *i = file->private_data; struct bch_fs *c = i->c; @@ -856,7 +860,7 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, i->ret = 0; if (!i->iter) { - btree_deadlock_to_text(&i->buf, c); + fn(&i->buf, c); i->iter++; } @@ -869,6 +873,12 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, return ret ?: i->ret; } +static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + return bch2_simple_print(file, buf, size, ppos, btree_deadlock_to_text); +} + static const struct file_operations btree_deadlock_ops = { .owner = THIS_MODULE, .open = bch2_dump_open, @@ -876,6 +886,19 @@ static const struct file_operations btree_deadlock_ops = { .read = bch2_btree_deadlock_read, }; +static ssize_t bch2_write_points_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + return bch2_simple_print(file, buf, size, ppos, bch2_write_points_to_text); +} + +static const struct file_operations write_points_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_write_points_read, +}; + void bch2_fs_debug_exit(struct bch_fs *c) { if (!IS_ERR_OR_NULL(c->fs_debug_dir)) @@ -927,6 +950,9 @@ void bch2_fs_debug_init(struct bch_fs *c) debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir, c->btree_debug, &btree_deadlock_ops); + debugfs_create_file("write_points", 0400, c->fs_debug_dir, + c->btree_debug, &write_points_ops); + c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); if (IS_ERR_OR_NULL(c->btree_debug_dir)) return; diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 600eee936f13..d7f9f79318a2 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -13,6 +13,40 @@ #include <linux/dcache.h> +static int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, + const struct qstr *str, struct qstr *out_cf) +{ + *out_cf = (struct qstr) QSTR_INIT(NULL, 0); + +#ifdef CONFIG_UNICODE + unsigned char *buf = bch2_trans_kmalloc(trans, BCH_NAME_MAX + 1); + int ret = PTR_ERR_OR_ZERO(buf); + if (ret) + return ret; + + ret = utf8_casefold(info->cf_encoding, str, buf, BCH_NAME_MAX + 1); + if (ret <= 0) + return ret; + + *out_cf = (struct qstr) QSTR_INIT(buf, ret); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static inline int bch2_maybe_casefold(struct btree_trans *trans, + const struct bch_hash_info *info, + const struct qstr *str, struct qstr *out_cf) +{ + if (likely(!info->cf_encoding)) { + *out_cf = *str; + return 0; + } else { + return bch2_casefold(trans, info, str, out_cf); + } +} + static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) { if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name)) @@ -28,13 +62,38 @@ static unsigned 
bch2_dirent_name_bytes(struct bkey_s_c_dirent d) #endif return bkey_bytes - - offsetof(struct bch_dirent, d_name) - + (d.v->d_casefold + ? offsetof(struct bch_dirent, d_cf_name_block.d_names) + : offsetof(struct bch_dirent, d_name)) - trailing_nuls; } struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d) { - return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); + if (d.v->d_casefold) { + unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len); + return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[0], name_len); + } else { + return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); + } +} + +static struct qstr bch2_dirent_get_casefold_name(struct bkey_s_c_dirent d) +{ + if (d.v->d_casefold) { + unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len); + unsigned cf_name_len = le16_to_cpu(d.v->d_cf_name_block.d_cf_name_len); + return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[name_len], cf_name_len); + } else { + return (struct qstr) QSTR_INIT(NULL, 0); + } +} + +static inline struct qstr bch2_dirent_get_lookup_name(struct bkey_s_c_dirent d) +{ + return d.v->d_casefold + ? bch2_dirent_get_casefold_name(d) + : bch2_dirent_get_name(d); } static u64 bch2_dirent_hash(const struct bch_hash_info *info, @@ -57,7 +116,7 @@ static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key) static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - struct qstr name = bch2_dirent_get_name(d); + struct qstr name = bch2_dirent_get_lookup_name(d); return bch2_dirent_hash(info, &name); } @@ -65,7 +124,7 @@ static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) { struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); - const struct qstr l_name = bch2_dirent_get_name(l); + const struct qstr l_name = bch2_dirent_get_lookup_name(l); const struct qstr *r_name = _r; return !qstr_eq(l_name, *r_name); @@ -75,8 +134,8 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) { struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r); - const struct qstr l_name = bch2_dirent_get_name(l); - const struct qstr r_name = bch2_dirent_get_name(r); + const struct qstr l_name = bch2_dirent_get_lookup_name(l); + const struct qstr r_name = bch2_dirent_get_lookup_name(r); return !qstr_eq(l_name, r_name); } @@ -104,17 +163,19 @@ int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + unsigned name_block_len = bch2_dirent_name_bytes(d); struct qstr d_name = bch2_dirent_get_name(d); + struct qstr d_cf_name = bch2_dirent_get_casefold_name(d); int ret = 0; bkey_fsck_err_on(!d_name.len, c, dirent_empty_name, "empty name"); - bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), + bkey_fsck_err_on(d_name.len + d_cf_name.len > name_block_len, c, dirent_val_too_big, - "value too big (%zu > %u)", - bkey_val_u64s(k.k), dirent_val_u64s(d_name.len)); + "dirent names exceed bkey size (%d + %d > %d)", + d_name.len, d_cf_name.len, name_block_len); /* * Check new keys don't exceed the max length @@ -142,6 +203,18 @@ int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, le64_to_cpu(d.v->d_inum) == d.k->p.inode, c, dirent_to_itself, "dirent points to own directory"); + + if (d.v->d_casefold) { + bkey_fsck_err_on(from.from == 
BKEY_VALIDATE_commit && + d_cf_name.len > BCH_NAME_MAX, + c, dirent_cf_name_too_big, + "dirent w/ cf name too big (%u > %u)", + d_cf_name.len, BCH_NAME_MAX); + + bkey_fsck_err_on(d_cf_name.len != strnlen(d_cf_name.name, d_cf_name.len), + c, dirent_stray_data_after_cf_name, + "dirent has stray data after cf name's NUL"); + } fsck_err: return ret; } @@ -163,15 +236,14 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type)); } -static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, - subvol_inum dir, u8 type, - const struct qstr *name, u64 dst) +static struct bkey_i_dirent *dirent_alloc_key(struct btree_trans *trans, + subvol_inum dir, + u8 type, + int name_len, int cf_name_len, + u64 dst) { struct bkey_i_dirent *dirent; - unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len); - - if (name->len > BCH_NAME_MAX) - return ERR_PTR(-ENAMETOOLONG); + unsigned u64s = BKEY_U64s + dirent_val_u64s(name_len, cf_name_len); BUG_ON(u64s > U8_MAX); @@ -190,14 +262,65 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, } dirent->v.d_type = type; + dirent->v.d_unused = 0; + dirent->v.d_casefold = cf_name_len ? 1 : 0; - memcpy(dirent->v.d_name, name->name, name->len); - memset(dirent->v.d_name + name->len, 0, - bkey_val_bytes(&dirent->k) - - offsetof(struct bch_dirent, d_name) - - name->len); + return dirent; +} - EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len); +static void dirent_init_regular_name(struct bkey_i_dirent *dirent, + const struct qstr *name) +{ + EBUG_ON(dirent->v.d_casefold); + + memcpy(&dirent->v.d_name[0], name->name, name->len); + memset(&dirent->v.d_name[name->len], 0, + bkey_val_bytes(&dirent->k) - + offsetof(struct bch_dirent, d_name) - + name->len); +} + +static void dirent_init_casefolded_name(struct bkey_i_dirent *dirent, + const struct qstr *name, + const struct qstr *cf_name) +{ + EBUG_ON(!dirent->v.d_casefold); + EBUG_ON(!cf_name->len); + + dirent->v.d_cf_name_block.d_name_len = name->len; + dirent->v.d_cf_name_block.d_cf_name_len = cf_name->len; + memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len); + memcpy(&dirent->v.d_cf_name_block.d_names[name->len], cf_name->name, cf_name->len); + memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_name->len], 0, + bkey_val_bytes(&dirent->k) - + offsetof(struct bch_dirent, d_cf_name_block.d_names) - + name->len + cf_name->len); + + EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_name->len); +} + +static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, + subvol_inum dir, + u8 type, + const struct qstr *name, + const struct qstr *cf_name, + u64 dst) +{ + struct bkey_i_dirent *dirent; + + if (name->len > BCH_NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + dirent = dirent_alloc_key(trans, dir, type, name->len, cf_name ? 
cf_name->len : 0, dst); + if (IS_ERR(dirent)) + return dirent; + + if (cf_name) + dirent_init_casefolded_name(dirent, name, cf_name); + else + dirent_init_regular_name(dirent, name); + + EBUG_ON(bch2_dirent_get_name(dirent_i_to_s_c(dirent)).len != name->len); return dirent; } @@ -213,7 +336,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, struct bkey_i_dirent *dirent; int ret; - dirent = dirent_create_key(trans, dir_inum, type, name, dst_inum); + dirent = dirent_create_key(trans, dir_inum, type, name, NULL, dst_inum); ret = PTR_ERR_OR_ZERO(dirent); if (ret) return ret; @@ -233,16 +356,28 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, u64 *dir_offset, + u64 *i_size, enum btree_iter_update_trigger_flags flags) { struct bkey_i_dirent *dirent; int ret; - dirent = dirent_create_key(trans, dir, type, name, dst_inum); + if (hash_info->cf_encoding) { + struct qstr cf_name; + ret = bch2_casefold(trans, hash_info, name, &cf_name); + if (ret) + return ret; + dirent = dirent_create_key(trans, dir, type, name, &cf_name, dst_inum); + } else { + dirent = dirent_create_key(trans, dir, type, name, NULL, dst_inum); + } + ret = PTR_ERR_OR_ZERO(dirent); if (ret) return ret; + *i_size += bkey_bytes(&dirent->k); + ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, dir, &dirent->k_i, flags); *dir_offset = dirent->k.p.offset; @@ -275,12 +410,13 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, } int bch2_dirent_rename(struct btree_trans *trans, - subvol_inum src_dir, struct bch_hash_info *src_hash, - subvol_inum dst_dir, struct bch_hash_info *dst_hash, + subvol_inum src_dir, struct bch_hash_info *src_hash, u64 *src_dir_i_size, + subvol_inum dst_dir, struct bch_hash_info *dst_hash, u64 *dst_dir_i_size, const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset, const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset, enum bch_rename_mode mode) { + struct qstr src_name_lookup, dst_name_lookup; struct btree_iter src_iter = { NULL }; struct btree_iter dst_iter = { NULL }; struct bkey_s_c old_src, old_dst = bkey_s_c_null; @@ -295,8 +431,11 @@ int bch2_dirent_rename(struct btree_trans *trans, memset(dst_inum, 0, sizeof(*dst_inum)); /* Lookup src: */ + ret = bch2_maybe_casefold(trans, src_hash, src_name, &src_name_lookup); + if (ret) + goto out; old_src = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc, - src_hash, src_dir, src_name, + src_hash, src_dir, &src_name_lookup, BTREE_ITER_intent); ret = bkey_err(old_src); if (ret) @@ -308,6 +447,9 @@ int bch2_dirent_rename(struct btree_trans *trans, goto out; /* Lookup dst: */ + ret = bch2_maybe_casefold(trans, dst_hash, dst_name, &dst_name_lookup); + if (ret) + goto out; if (mode == BCH_RENAME) { /* * Note that we're _not_ checking if the target already exists - @@ -315,12 +457,12 @@ int bch2_dirent_rename(struct btree_trans *trans, * correctness: */ ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc, - dst_hash, dst_dir, dst_name); + dst_hash, dst_dir, &dst_name_lookup); if (ret) goto out; } else { old_dst = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, - dst_hash, dst_dir, dst_name, + dst_hash, dst_dir, &dst_name_lookup, BTREE_ITER_intent); ret = bkey_err(old_dst); if (ret) @@ -336,7 +478,8 @@ int bch2_dirent_rename(struct btree_trans *trans, *src_offset = dst_iter.pos.offset; /* Create new dst key: */ - new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0); 
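The dirent hunks above make casefolded directories store both the name as created and its casefolded form in a single value, with hashing and comparison done on the casefolded span (bch2_dirent_get_lookup_name). A minimal userspace sketch of that layout follows; the struct and helper names are simplified stand-ins for bch_dirent's d_cf_name_block, not kernel API:

/*
 * Sketch of the casefolded dirent value: the user-visible name is stored
 * first, then its casefolded form, with two u16 lengths so either span can
 * be recovered.  Illustrative layout only.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct cf_name_block {
	uint16_t name_len;
	uint16_t cf_name_len;
	uint8_t  names[];		/* name bytes, then casefolded bytes */
};

static struct cf_name_block *pack_names(const char *name, const char *cf_name)
{
	size_t nl = strlen(name), cl = strlen(cf_name);
	struct cf_name_block *b = calloc(1, sizeof(*b) + nl + cl);

	if (!b)
		return NULL;
	b->name_len	= nl;
	b->cf_name_len	= cl;
	memcpy(b->names, name, nl);
	memcpy(b->names + nl, cf_name, cl);	/* hashed/compared span */
	return b;
}

int main(void)
{
	struct cf_name_block *b = pack_names("ReadMe.TXT", "readme.txt");

	/* lookup hashes the casefolded span; readdir shows the original */
	printf("lookup key: %.*s\n", b->cf_name_len, b->names + b->name_len);
	printf("shown name: %.*s\n", b->name_len, b->names);
	free(b);
	return 0;
}

This is also why the rename path above looks up src and dst through bch2_maybe_casefold first: on a casefolding directory the hash index is keyed by the folded form, while the stored dirent keeps the original spelling for readdir.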
+ new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, + dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0); ret = PTR_ERR_OR_ZERO(new_dst); if (ret) goto out; @@ -346,7 +489,8 @@ int bch2_dirent_rename(struct btree_trans *trans, /* Create new src key: */ if (mode == BCH_RENAME_EXCHANGE) { - new_src = dirent_create_key(trans, src_dir, 0, src_name, 0); + new_src = dirent_create_key(trans, src_dir, 0, src_name, + src_hash->cf_encoding ? &src_name_lookup : NULL, 0); ret = PTR_ERR_OR_ZERO(new_src); if (ret) goto out; @@ -406,6 +550,14 @@ int bch2_dirent_rename(struct btree_trans *trans, new_src->v.d_type == DT_SUBVOL) new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol); + if (old_dst.k) + *dst_dir_i_size -= bkey_bytes(old_dst.k); + *src_dir_i_size -= bkey_bytes(old_src.k); + + if (mode == BCH_RENAME_EXCHANGE) + *src_dir_i_size += bkey_bytes(&new_src->k); + *dst_dir_i_size += bkey_bytes(&new_dst->k); + ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); if (ret) goto out; @@ -465,9 +617,14 @@ int bch2_dirent_lookup_trans(struct btree_trans *trans, const struct qstr *name, subvol_inum *inum, unsigned flags) { + struct qstr lookup_name; + int ret = bch2_maybe_casefold(trans, hash_info, name, &lookup_name); + if (ret) + return ret; + struct bkey_s_c k = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, - hash_info, dir, name, flags); - int ret = bkey_err(k); + hash_info, dir, &lookup_name, flags); + ret = bkey_err(k); if (ret) goto err; @@ -572,3 +729,54 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) return ret < 0 ? ret : 0; } + +/* fsck */ + +static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inode_nr) + break; + if (!bkey_is_inode(k.k)) + continue; + ret = bch2_inode_unpack(k, inode); + goto found; + } + ret = -BCH_ERR_ENOENT_inode; +found: + bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_fsck_remove_dirent(struct btree_trans *trans, struct bpos pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bch_inode_unpacked dir_inode; + struct bch_hash_info dir_hash_info; + int ret; + + ret = lookup_first_inode(trans, pos.inode, &dir_inode); + if (ret) + goto err; + + dir_hash_info = bch2_hash_info_init(c, &dir_inode); + + bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); + + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + &dir_hash_info, &iter, + BTREE_UPDATE_internal_snapshot_node); + bch2_trans_iter_exit(trans, &iter); +err: + bch_err_fn(c, ret); + return ret; +} diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 362b3b2f2f2e..0880772b80a9 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -25,10 +25,13 @@ struct bch_inode_info; struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d); -static inline unsigned dirent_val_u64s(unsigned len) +static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len) { - return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len, - sizeof(u64)); + unsigned bytes = cf_len + ? 
offsetof(struct bch_dirent, d_cf_name_block.d_names) + len + cf_len + : offsetof(struct bch_dirent, d_name) + len; + + return DIV_ROUND_UP(bytes, sizeof(u64)); } int bch2_dirent_read_target(struct btree_trans *, subvol_inum, @@ -47,7 +50,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32, enum btree_iter_update_trigger_flags); int bch2_dirent_create(struct btree_trans *, subvol_inum, const struct bch_hash_info *, u8, - const struct qstr *, u64, u64 *, + const struct qstr *, u64, u64 *, u64 *, enum btree_iter_update_trigger_flags); static inline unsigned vfs_d_type(unsigned type) @@ -62,8 +65,8 @@ enum bch_rename_mode { }; int bch2_dirent_rename(struct btree_trans *, - subvol_inum, struct bch_hash_info *, - subvol_inum, struct bch_hash_info *, + subvol_inum, struct bch_hash_info *, u64 *, + subvol_inum, struct bch_hash_info *, u64 *, const struct qstr *, subvol_inum *, u64 *, const struct qstr *, subvol_inum *, u64 *, enum bch_rename_mode); @@ -79,4 +82,6 @@ int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32); int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); +int bch2_fsck_remove_dirent(struct btree_trans *, struct bpos); + #endif /* _BCACHEFS_DIRENT_H */ diff --git a/fs/bcachefs/dirent_format.h b/fs/bcachefs/dirent_format.h index 5e116b88e814..a46dbddd21aa 100644 --- a/fs/bcachefs/dirent_format.h +++ b/fs/bcachefs/dirent_format.h @@ -29,9 +29,25 @@ struct bch_dirent { * Copy of mode bits 12-15 from the target inode - so userspace can get * the filetype without having to do a stat() */ - __u8 d_type; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 d_type:5, + d_unused:2, + d_casefold:1; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 d_casefold:1, + d_unused:2, + d_type:5; +#endif - __u8 d_name[]; + union { + struct { + __u8 d_pad; + __le16 d_name_len; + __le16 d_cf_name_len; + __u8 d_names[]; + } d_cf_name_block __packed; + __DECLARE_FLEX_ARRAY(__u8, d_name); + } __packed; } __packed __aligned(8); #define DT_SUBVOL 16 diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index b32e91ba8be8..a59f6c12529b 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -114,10 +114,9 @@ int bch2_mod_dev_cached_sectors(struct btree_trans *trans, unsigned dev, s64 sectors, bool gc) { - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_replicas, - }; - + struct disk_accounting_pos acc; + memset(&acc, 0, sizeof(acc)); + acc.type = BCH_DISK_ACCOUNTING_replicas; bch2_replicas_entry_cached(&acc.replicas, dev); return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc); @@ -135,6 +134,12 @@ static inline bool is_zero(char *start, char *end) #define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member)) +static const unsigned bch2_accounting_type_nr_counters[] = { +#define x(f, id, nr) [BCH_DISK_ACCOUNTING_##f] = nr, + BCH_DISK_ACCOUNTING_TYPES() +#undef x +}; + int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { @@ -193,6 +198,11 @@ int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)), c, accounting_key_junk_at_end, "junk at end of accounting key"); + + bkey_fsck_err_on(bch2_accounting_counters(k.k) != bch2_accounting_type_nr_counters[acc_k.type], + c, accounting_key_nr_counters_wrong, + "accounting key with %u counters, should be %u", + bch2_accounting_counters(k.k),
bch2_accounting_type_nr_counters[acc_k.type]); fsck_err: return ret; } @@ -635,7 +645,7 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e), trans, accounting_replicas_not_marked, - "accounting not marked in superblock replicas\n %s", + "accounting not marked in superblock replicas\n%s", (printbuf_reset(&buf), bch2_accounting_key_to_text(&buf, &acc), buf.buf))) { @@ -665,7 +675,7 @@ fsck_err: return ret; invalid_device: if (fsck_err(trans, accounting_to_invalid_device, - "accounting entry points to invalid device %i\n %s", + "accounting entry points to invalid device %i\n%s", invalid_dev, (printbuf_reset(&buf), bch2_accounting_key_to_text(&buf, &acc), @@ -726,7 +736,9 @@ int bch2_accounting_read(struct bch_fs *c) break; if (!bch2_accounting_is_mem(acc_k)) { - struct disk_accounting_pos next = { .type = acc_k.type + 1 }; + struct disk_accounting_pos next; + memset(&next, 0, sizeof(next)); + next.type = acc_k.type + 1; bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); continue; } @@ -882,15 +894,13 @@ int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev) int bch2_dev_usage_init(struct bch_dev *ca, bool gc) { struct bch_fs *c = ca->fs; - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_dev_data_type, - .dev_data_type.dev = ca->dev_idx, - .dev_data_type.data_type = BCH_DATA_free, - }; u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 }; int ret = bch2_trans_do(c, ({ - bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), gc) ?: + bch2_disk_accounting_mod2(trans, gc, + v, dev_data_type, + .dev = ca->dev_idx, + .data_type = BCH_DATA_free) ?: (!gc ? bch2_trans_commit(trans, NULL, NULL, 0) : 0); })); bch_err_fn(c, ret); @@ -917,7 +927,9 @@ void bch2_verify_accounting_clean(struct bch_fs *c) break; if (!bch2_accounting_is_mem(acc_k)) { - struct disk_accounting_pos next = { .type = acc_k.type + 1 }; + struct disk_accounting_pos next; + memset(&next, 0, sizeof(next)); + next.type = acc_k.type + 1; bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); continue; } diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index f4372cafea2e..abb1f6206fe9 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -33,10 +33,12 @@ static inline bool bch2_accounting_key_is_zero(struct bkey_s_c_accounting a) static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst, struct bkey_s_c_accounting src) { - EBUG_ON(dst->k.u64s != src.k->u64s); - - for (unsigned i = 0; i < bch2_accounting_counters(&dst->k); i++) + for (unsigned i = 0; + i < min(bch2_accounting_counters(&dst->k), + bch2_accounting_counters(src.k)); + i++) dst->v.d[i] += src.v->d[i]; + if (bversion_cmp(dst->k.bversion, src.k->bversion) < 0) dst->k.bversion = src.k->bversion; } @@ -85,6 +87,24 @@ static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *, s64 *, unsigned, bool); + +#define disk_accounting_key_init(_k, _type, ...) \ +do { \ + memset(&(_k), 0, sizeof(_k)); \ + (_k).type = BCH_DISK_ACCOUNTING_##_type; \ + (_k)._type = (struct bch_acct_##_type) { __VA_ARGS__ }; \ +} while (0) + +#define bch2_disk_accounting_mod2_nr(_trans, _gc, _v, _nr, ...) 
\ +({ \ + struct disk_accounting_pos pos; \ + disk_accounting_key_init(pos, __VA_ARGS__); \ + bch2_disk_accounting_mod(trans, &pos, _v, _nr, _gc); \ +}) + +#define bch2_disk_accounting_mod2(_trans, _gc, _v, ...) \ + bch2_disk_accounting_mod2_nr(_trans, _gc, _v, ARRAY_SIZE(_v), __VA_ARGS__) + int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool); int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h index 7b6e6c97e6aa..8269af1dbe2a 100644 --- a/fs/bcachefs/disk_accounting_format.h +++ b/fs/bcachefs/disk_accounting_format.h @@ -95,40 +95,81 @@ static inline bool data_type_is_hidden(enum bch_data_type type) } } +/* + * field 1: name + * field 2: id + * field 3: number of counters (max 3) + */ + #define BCH_DISK_ACCOUNTING_TYPES() \ - x(nr_inodes, 0) \ - x(persistent_reserved, 1) \ - x(replicas, 2) \ - x(dev_data_type, 3) \ - x(compression, 4) \ - x(snapshot, 5) \ - x(btree, 6) \ - x(rebalance_work, 7) \ - x(inum, 8) + x(nr_inodes, 0, 1) \ + x(persistent_reserved, 1, 1) \ + x(replicas, 2, 1) \ + x(dev_data_type, 3, 3) \ + x(compression, 4, 3) \ + x(snapshot, 5, 1) \ + x(btree, 6, 1) \ + x(rebalance_work, 7, 1) \ + x(inum, 8, 3) enum disk_accounting_type { -#define x(f, nr) BCH_DISK_ACCOUNTING_##f = nr, +#define x(f, nr, ...) BCH_DISK_ACCOUNTING_##f = nr, BCH_DISK_ACCOUNTING_TYPES() #undef x BCH_DISK_ACCOUNTING_TYPE_NR, }; -struct bch_nr_inodes { +/* + * No subtypes - number of inodes in the entire filesystem + * + * XXX: perhaps we could add a per-subvolume counter? + */ +struct bch_acct_nr_inodes { }; -struct bch_persistent_reserved { +/* + * Tracks KEY_TYPE_reservation sectors, broken out by number of replicas for the + * reservation: + */ +struct bch_acct_persistent_reserved { __u8 nr_replicas; }; -struct bch_dev_data_type { +/* + * device, data type counter fields: + * [ + * nr_buckets + * live sectors (in buckets of that data type) + * sectors of internal fragmentation + * ] + * + * XXX: live sectors should've been done differently, you can have multiple data + * types in the same bucket (user, stripe, cached) and this collapses them to + * the bucket data type, and makes the internal fragmentation counter redundant + */ +struct bch_acct_dev_data_type { __u8 dev; __u8 data_type; }; +/* + * Compression type fields: + * [ + * number of extents + * uncompressed size + * compressed size + * ] + * + * Compression ratio, average extent size (fragmentation). + */ struct bch_acct_compression { __u8 type; }; +/* + * On disk usage by snapshot id; counts same values as replicas counter, but + * aggregated differently + */ struct bch_acct_snapshot { __u32 id; } __packed; @@ -137,10 +178,27 @@ struct bch_acct_btree { __u32 id; } __packed; +/* + * inum counter fields: + * [ + * number of extents + * sum of extent sizes - bkey size + * this field is similar to inode.bi_sectors, except here extents in + * different snapshots but the same inode number are all collapsed to the + * same counter + * sum of on disk size - same values tracked by replicas counters + * ] + * + * This tracks on disk fragmentation. + */ struct bch_acct_inum { __u64 inum; } __packed; +/* + * Simple counter of the amount of data (on disk sectors) rebalance needs to + * move, extents counted here are also in the rebalance_work btree. 
+ */ struct bch_acct_rebalance_work { }; @@ -149,10 +207,10 @@ struct disk_accounting_pos { struct { __u8 type; union { - struct bch_nr_inodes nr_inodes; - struct bch_persistent_reserved persistent_reserved; + struct bch_acct_nr_inodes nr_inodes; + struct bch_acct_persistent_reserved persistent_reserved; struct bch_replicas_entry_v1 replicas; - struct bch_dev_data_type dev_data_type; + struct bch_acct_dev_data_type dev_data_type; struct bch_acct_compression compression; struct bch_acct_snapshot snapshot; struct bch_acct_btree btree; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index d2a5e76e6479..6faeda7ad03d 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -20,6 +20,7 @@ #include "io_read.h" #include "io_write.h" #include "keylist.h" +#include "lru.h" #include "recovery.h" #include "replicas.h" #include "super-io.h" @@ -104,6 +105,7 @@ struct ec_bio { struct bch_dev *ca; struct ec_stripe_buf *buf; size_t idx; + u64 submit_time; struct bio bio; }; @@ -298,15 +300,27 @@ static int mark_stripe_bucket(struct btree_trans *trans, struct bpos bucket = PTR_BUCKET_POS(ca, ptr); if (flags & BTREE_TRIGGER_transactional) { + struct extent_ptr_decoded p = { + .ptr = *ptr, + .crc = bch2_extent_crc_unpack(s.k, NULL), + }; + struct bkey_i_backpointer bp; + bch2_extent_ptr_to_bp(c, BTREE_ID_stripes, 0, s.s_c, p, + (const union bch_extent_entry *) ptr, &bp); + struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); - ret = PTR_ERR_OR_ZERO(a) ?: - __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags); + ret = PTR_ERR_OR_ZERO(a) ?: + __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags) ?: + bch2_bucket_backpointer_mod(trans, s.s_c, &bp, + !(flags & BTREE_TRIGGER_overwrite)); + if (ret) + goto err; } if (flags & BTREE_TRIGGER_gc) { struct bucket *g = gc_bucket(ca, bucket.offset); - if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", + if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n%s", ptr->dev, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ret = -BCH_ERR_mark_stripe; @@ -366,19 +380,6 @@ static int mark_stripe_buckets(struct btree_trans *trans, return 0; } -static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s) -{ - m->sectors = le16_to_cpu(s->sectors); - m->algorithm = s->algorithm; - m->nr_blocks = s->nr_blocks; - m->nr_redundant = s->nr_redundant; - m->disk_label = s->disk_label; - m->blocks_nonempty = 0; - - for (unsigned i = 0; i < s->nr_blocks; i++) - m->blocks_nonempty += !!stripe_blockcount_get(s, i); -} - int bch2_trigger_stripe(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s _new, @@ -399,6 +400,15 @@ int bch2_trigger_stripe(struct btree_trans *trans, (new_s->nr_blocks != old_s->nr_blocks || new_s->nr_redundant != old_s->nr_redundant)); + if (flags & BTREE_TRIGGER_transactional) { + int ret = bch2_lru_change(trans, + BCH_LRU_STRIPE_FRAGMENTATION, + idx, + stripe_lru_pos(old_s), + stripe_lru_pos(new_s)); + if (ret) + return ret; + } if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { /* @@ -443,9 +453,9 @@ int bch2_trigger_stripe(struct btree_trans *trans, if (new_s) { s64 sectors = (u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant; - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_replicas, - }; + struct disk_accounting_pos acc; + memset(&acc, 0, sizeof(acc)); + acc.type = BCH_DISK_ACCOUNTING_replicas; bch2_bkey_to_replicas(&acc.replicas, new); int 
ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc); if (ret) @@ -458,9 +468,9 @@ int bch2_trigger_stripe(struct btree_trans *trans, if (old_s) { s64 sectors = -((s64) le16_to_cpu(old_s->sectors)) * old_s->nr_redundant; - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_replicas, - }; + struct disk_accounting_pos acc; + memset(&acc, 0, sizeof(acc)); + acc.type = BCH_DISK_ACCOUNTING_replicas; bch2_bkey_to_replicas(&acc.replicas, old); int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc); if (ret) @@ -472,38 +482,6 @@ int bch2_trigger_stripe(struct btree_trans *trans, return ret; } - if (flags & BTREE_TRIGGER_atomic) { - struct stripe *m = genradix_ptr(&c->stripes, idx); - - if (!m) { - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - - bch2_bkey_val_to_text(&buf1, c, old); - bch2_bkey_val_to_text(&buf2, c, new); - bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" - "old %s\n" - "new %s", idx, buf1.buf, buf2.buf); - printbuf_exit(&buf2); - printbuf_exit(&buf1); - bch2_inconsistent_error(c); - return -1; - } - - if (!new_s) { - bch2_stripes_heap_del(c, m, idx); - - memset(m, 0, sizeof(*m)); - } else { - stripe_to_mem(m, new_s); - - if (!old_s) - bch2_stripes_heap_insert(c, m, idx); - else - bch2_stripes_heap_update(c, m, idx); - } - } - return 0; } @@ -726,14 +704,15 @@ static void ec_block_endio(struct bio *bio) struct bch_dev *ca = ec_bio->ca; struct closure *cl = bio->bi_private; - if (bch2_dev_io_err_on(bio->bi_status, ca, - bio_data_dir(bio) - ? BCH_MEMBER_ERROR_write - : BCH_MEMBER_ERROR_read, - "erasure coding %s error: %s", + bch2_account_io_completion(ca, bio_data_dir(bio), + ec_bio->submit_time, !bio->bi_status); + + if (bio->bi_status) { + bch_err_dev_ratelimited(ca, "erasure coding %s error: %s", str_write_read(bio_data_dir(bio)), - bch2_blk_status_to_str(bio->bi_status))) + bch2_blk_status_to_str(bio->bi_status)); clear_bit(ec_bio->idx, ec_bio->buf->valid); + } int stale = dev_ptr_stale(ca, ptr); if (stale) { @@ -796,6 +775,7 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, ec_bio->ca = ca; ec_bio->buf = buf; ec_bio->idx = idx; + ec_bio->submit_time = local_clock(); ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); ec_bio->bio.bi_end_io = ec_block_endio; @@ -917,26 +897,6 @@ err: static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) { - ec_stripes_heap n, *h = &c->ec_stripes_heap; - - if (idx >= h->size) { - if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) - return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; - - mutex_lock(&c->ec_stripes_heap_lock); - if (n.size > h->size) { - memcpy(n.data, h->data, h->nr * sizeof(h->data[0])); - n.nr = h->nr; - swap(*h, n); - } - mutex_unlock(&c->ec_stripes_heap_lock); - - free_heap(&n); - } - - if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) - return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; - if (c->gc_pos.phase != GC_PHASE_not_running && !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; @@ -1009,180 +969,50 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s) { s->idx = 0; } -/* Heap of all existing stripes, ordered by blocks_nonempty */ - -static u64 stripe_idx_to_delete(struct bch_fs *c) -{ - ec_stripes_heap *h = &c->ec_stripes_heap; - - lockdep_assert_held(&c->ec_stripes_heap_lock); - - if (h->nr && - h->data[0].blocks_nonempty == 0 && - !bch2_stripe_is_open(c, h->data[0].idx)) - return h->data[0].idx; - - return 0; -} -
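The deletions that follow remove the in-memory ec_stripes_heap entirely: stripe state is now derived straight from the stripe key and mirrored into the persistent LRU btree by the transactional trigger (bch2_lru_change with stripe_lru_pos). A rough sketch of the classification this relies on; the range constants follow the lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1/2, ...) usage visible in this diff, while the field names and exact encoding are illustrative assumptions:

/*
 * Illustrative model of stripe_lru_pos()-style classification: an empty
 * stripe lands in LRU range 1 (scanned by the delete worker), a partially
 * empty one in range 2 (scanned for reuse), and a full stripe is not in
 * the LRU at all.  Simplified fields, not bcachefs's structures.
 */
#include <stdint.h>
#include <stdio.h>

#define STRIPE_LRU_EMPTY	1	/* ec_stripe_delete_work range */
#define STRIPE_LRU_PARTIAL	2	/* __bch2_ec_stripe_head_reuse range */

struct stripe_sketch {
	unsigned nr_blocks, nr_redundant;
	unsigned blockcount[16];	/* live sectors per data block */
};

static uint64_t stripe_lru_range(const struct stripe_sketch *s)
{
	unsigned nr_data = s->nr_blocks - s->nr_redundant, nonempty = 0;

	for (unsigned i = 0; i < nr_data; i++)
		nonempty += s->blockcount[i] != 0;

	if (!nonempty)
		return STRIPE_LRU_EMPTY;
	if (nonempty < nr_data)
		return STRIPE_LRU_PARTIAL;
	return 0;	/* full: nothing for the LRU to track */
}

int main(void)
{
	struct stripe_sketch empty   = { 6, 2, { 0 } };
	struct stripe_sketch partial = { 6, 2, { 128, 0, 64, 0 } };

	printf("empty -> range %llu, partial -> range %llu\n",
	       (unsigned long long) stripe_lru_range(&empty),
	       (unsigned long long) stripe_lru_range(&partial));
	return 0;
}

Because the LRU btree is updated through the write buffer, readers of these ranges can observe stale entries; that is why ec_stripe_delete() below re-checks the stripe key (and bch2_stripe_is_open) with the key locked before deleting.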
-static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, - size_t i) -{ - struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); - - genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i; -} - -static inline bool ec_stripes_heap_cmp(const void *l, const void *r, void __always_unused *args) -{ - struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l; - struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r; - - return ((_l->blocks_nonempty > _r->blocks_nonempty) < - (_l->blocks_nonempty < _r->blocks_nonempty)); -} - -static inline void ec_stripes_heap_swap(void *l, void *r, void *h) -{ - struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l; - struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r; - ec_stripes_heap *_h = (ec_stripes_heap *)h; - size_t i = _l - _h->data; - size_t j = _r - _h->data; - - swap(*_l, *_r); - - ec_stripes_heap_set_backpointer(_h, i); - ec_stripes_heap_set_backpointer(_h, j); -} - -static const struct min_heap_callbacks callbacks = { - .less = ec_stripes_heap_cmp, - .swp = ec_stripes_heap_swap, -}; - -static void heap_verify_backpointer(struct bch_fs *c, size_t idx) -{ - ec_stripes_heap *h = &c->ec_stripes_heap; - struct stripe *m = genradix_ptr(&c->stripes, idx); - - BUG_ON(m->heap_idx >= h->nr); - BUG_ON(h->data[m->heap_idx].idx != idx); -} - -void bch2_stripes_heap_del(struct bch_fs *c, - struct stripe *m, size_t idx) -{ - mutex_lock(&c->ec_stripes_heap_lock); - heap_verify_backpointer(c, idx); - - min_heap_del(&c->ec_stripes_heap, m->heap_idx, &callbacks, &c->ec_stripes_heap); - mutex_unlock(&c->ec_stripes_heap_lock); -} - -void bch2_stripes_heap_insert(struct bch_fs *c, - struct stripe *m, size_t idx) -{ - mutex_lock(&c->ec_stripes_heap_lock); - BUG_ON(min_heap_full(&c->ec_stripes_heap)); - - genradix_ptr(&c->stripes, idx)->heap_idx = c->ec_stripes_heap.nr; - min_heap_push(&c->ec_stripes_heap, &((struct ec_stripe_heap_entry) { - .idx = idx, - .blocks_nonempty = m->blocks_nonempty, - }), - &callbacks, - &c->ec_stripes_heap); - - heap_verify_backpointer(c, idx); - mutex_unlock(&c->ec_stripes_heap_lock); -} - -void bch2_stripes_heap_update(struct bch_fs *c, - struct stripe *m, size_t idx) -{ - ec_stripes_heap *h = &c->ec_stripes_heap; - bool do_deletes; - size_t i; - - mutex_lock(&c->ec_stripes_heap_lock); - heap_verify_backpointer(c, idx); - - h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; - - i = m->heap_idx; - min_heap_sift_up(h, i, &callbacks, &c->ec_stripes_heap); - min_heap_sift_down(h, i, &callbacks, &c->ec_stripes_heap); - - heap_verify_backpointer(c, idx); - - do_deletes = stripe_idx_to_delete(c) != 0; - mutex_unlock(&c->ec_stripes_heap_lock); - - if (do_deletes) - bch2_do_stripe_deletes(c); -} - /* stripe deletion */ static int ec_stripe_delete(struct btree_trans *trans, u64 idx) { - struct bch_fs *c = trans->c; struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_stripe s; - int ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx), - BTREE_ITER_intent); - ret = bkey_err(k); + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, + BTREE_ID_stripes, POS(0, idx), + BTREE_ITER_intent); + int ret = bkey_err(k); if (ret) goto err; - if (k.k->type != KEY_TYPE_stripe) { - bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx); - ret = -EINVAL; - goto err; - } - - s = bkey_s_c_to_stripe(k); - for (unsigned i = 0; i < s.v->nr_blocks; i++) - if (stripe_blockcount_get(s.v, i)) { - struct printbuf buf = PRINTBUF; - - 
bch2_bkey_val_to_text(&buf, c, k); - bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf); - printbuf_exit(&buf); - ret = -EINVAL; - goto err; - } - - ret = bch2_btree_delete_at(trans, &iter, 0); + /* + * We expect write buffer races here + * Important: check stripe_is_open with stripe key locked: + */ + if (k.k->type == KEY_TYPE_stripe && + !bch2_stripe_is_open(trans->c, idx) && + stripe_lru_pos(bkey_s_c_to_stripe(k).v) == 1) + ret = bch2_btree_delete_at(trans, &iter, 0); err: bch2_trans_iter_exit(trans, &iter); return ret; } +/* + * XXX + * can we kill this and delete stripes from the trigger? + */ static void ec_stripe_delete_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, ec_stripe_delete_work); - while (1) { - mutex_lock(&c->ec_stripes_heap_lock); - u64 idx = stripe_idx_to_delete(c); - mutex_unlock(&c->ec_stripes_heap_lock); - - if (!idx) - break; - - int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - ec_stripe_delete(trans, idx)); - bch_err_fn(c, ret); - if (ret) - break; - } - + bch2_trans_run(c, + bch2_btree_write_buffer_tryflush(trans) ?: + for_each_btree_key_max_commit(trans, lru_iter, BTREE_ID_lru, + lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, 0), + lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, LRU_TIME_MAX), + 0, lru_k, + NULL, NULL, + BCH_TRANS_COMMIT_no_enospc, ({ + ec_stripe_delete(trans, lru_k.k->p.offset); + }))); bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); } @@ -1294,7 +1124,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, bch2_fs_inconsistent(c, "%s", buf.buf); printbuf_exit(&buf); - return -EIO; + return -BCH_ERR_erasure_coding_found_btree_node; } k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed); @@ -1360,7 +1190,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev); if (!ca) - return -EIO; + return -BCH_ERR_ENOENT_dev_not_found; struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); @@ -1380,8 +1210,12 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b if (bp_k.k->type != KEY_TYPE_backpointer) continue; + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); + if (bp.v->btree_id == BTREE_ID_stripes) + continue; + ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, - bkey_s_c_to_backpointer(bp_k), &last_flushed); + bp, &last_flushed); })); bch2_bkey_buf_exit(&last_flushed, c); @@ -1393,21 +1227,19 @@ static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) { struct btree_trans *trans = bch2_trans_get(c); struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; - unsigned i, nr_data = v->nr_blocks - v->nr_redundant; - int ret = 0; + unsigned nr_data = v->nr_blocks - v->nr_redundant; - ret = bch2_btree_write_buffer_flush_sync(trans); + int ret = bch2_btree_write_buffer_flush_sync(trans); if (ret) goto err; - for (i = 0; i < nr_data; i++) { + for (unsigned i = 0; i < nr_data; i++) { ret = ec_stripe_update_bucket(trans, s, i); if (ret) break; } err: bch2_trans_put(trans); - return ret; } @@ -1473,6 +1305,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (s->err) { if (!bch2_err_matches(s->err, EROFS)) bch_err(c, "error creating stripe: error writing data buckets"); + ret = s->err; goto err; } @@ -1481,6 +1314,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ec_do_recov(c, &s->existing_stripe)) { bch_err(c, "error creating stripe: error reading existing stripe"); + ret = 
-BCH_ERR_ec_block_read; goto err; } @@ -1506,6 +1340,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ec_nr_failed(&s->new_stripe)) { bch_err(c, "error creating stripe: error writing redundancy buckets"); + ret = -BCH_ERR_ec_block_write; goto err; } @@ -1527,6 +1362,8 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ret) goto err; err: + trace_stripe_create(c, s->idx, ret); + bch2_disk_reservation_put(c, &s->res); for (i = 0; i < v->nr_blocks; i++) @@ -1612,11 +1449,11 @@ static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int ec_stripe_new_set_pending(c, h); } -void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) +void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob, int err) { struct ec_stripe_new *s = ob->ec; - s->err = -EIO; + s->err = err; } void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) @@ -1968,39 +1805,40 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, return 0; } -static s64 get_existing_stripe(struct bch_fs *c, - struct ec_stripe_head *head) +static int __get_existing_stripe(struct btree_trans *trans, + struct ec_stripe_head *head, + struct ec_stripe_buf *stripe, + u64 idx) { - ec_stripes_heap *h = &c->ec_stripes_heap; - struct stripe *m; - size_t heap_idx; - u64 stripe_idx; - s64 ret = -1; - - if (may_create_new_stripe(c)) - return -1; + struct bch_fs *c = trans->c; - mutex_lock(&c->ec_stripes_heap_lock); - for (heap_idx = 0; heap_idx < h->nr; heap_idx++) { - /* No blocks worth reusing, stripe will just be deleted: */ - if (!h->data[heap_idx].blocks_nonempty) - continue; + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, + BTREE_ID_stripes, POS(0, idx), 0); + int ret = bkey_err(k); + if (ret) + goto err; - stripe_idx = h->data[heap_idx].idx; + /* We expect write buffer races here */ + if (k.k->type != KEY_TYPE_stripe) + goto out; - m = genradix_ptr(&c->stripes, stripe_idx); + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + if (stripe_lru_pos(s.v) <= 1) + goto out; - if (m->disk_label == head->disk_label && - m->algorithm == head->algo && - m->nr_redundant == head->redundancy && - m->sectors == head->blocksize && - m->blocks_nonempty < m->nr_blocks - m->nr_redundant && - bch2_try_open_stripe(c, head->s, stripe_idx)) { - ret = stripe_idx; - break; - } + if (s.v->disk_label == head->disk_label && + s.v->algorithm == head->algo && + s.v->nr_redundant == head->redundancy && + le16_to_cpu(s.v->sectors) == head->blocksize && + bch2_try_open_stripe(c, head->s, idx)) { + bkey_reassemble(&stripe->key, k); + ret = 1; } - mutex_unlock(&c->ec_stripes_heap_lock); +out: + bch2_set_btree_iter_dontneed(&iter); +err: + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -2052,24 +1890,33 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri struct ec_stripe_new *s) { struct bch_fs *c = trans->c; - s64 idx; - int ret; /* * If we can't allocate a new stripe, and there's no stripes with empty * blocks for us to reuse, that means we have to wait on copygc: */ - idx = get_existing_stripe(c, h); - if (idx < 0) - return -BCH_ERR_stripe_alloc_blocked; + if (may_create_new_stripe(c)) + return -1; - ret = get_stripe_key_trans(trans, idx, &s->existing_stripe); - bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c, - "reading stripe key: %s", bch2_err_str(ret)); - if (ret) { - bch2_stripe_close(c, s); - return ret; + struct btree_iter lru_iter; + struct bkey_s_c lru_k; + int ret = 0; + + 
for_each_btree_key_max_norestart(trans, lru_iter, BTREE_ID_lru, + lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, 0), + lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, LRU_TIME_MAX), + 0, lru_k, ret) { + ret = __get_existing_stripe(trans, h, &s->existing_stripe, lru_k.k->p.offset); + if (ret) + break; } + bch2_trans_iter_exit(trans, &lru_iter); + if (!ret) + ret = -BCH_ERR_stripe_alloc_blocked; + if (ret == 1) + ret = 0; + if (ret) + return ret; return init_new_stripe_from_existing(c, s); } @@ -2263,14 +2110,14 @@ static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_ if (ret) return ret; - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_replicas, - }; + struct disk_accounting_pos acc; s64 sectors = 0; for (unsigned i = 0; i < s->v.nr_blocks; i++) sectors -= stripe_blockcount_get(&s->v, i); + memset(&acc, 0, sizeof(acc)); + acc.type = BCH_DISK_ACCOUNTING_replicas; bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); acc.replicas.data_type = BCH_DATA_user; ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); @@ -2284,6 +2131,8 @@ static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_ sectors = -sectors; + memset(&acc, 0, sizeof(acc)); + acc.type = BCH_DISK_ACCOUNTING_replicas; bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); acc.replicas.data_type = BCH_DATA_user; ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); @@ -2367,46 +2216,7 @@ void bch2_fs_ec_flush(struct bch_fs *c) int bch2_stripes_read(struct bch_fs *c) { - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN, - BTREE_ITER_prefetch, k, ({ - if (k.k->type != KEY_TYPE_stripe) - continue; - - ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); - if (ret) - break; - - struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset); - - stripe_to_mem(m, bkey_s_c_to_stripe(k).v); - - bch2_stripes_heap_insert(c, m, k.k->p.offset); - 0; - }))); - bch_err_fn(c, ret); - return ret; -} - -void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) -{ - ec_stripes_heap *h = &c->ec_stripes_heap; - struct stripe *m; - size_t i; - - mutex_lock(&c->ec_stripes_heap_lock); - for (i = 0; i < min_t(size_t, h->nr, 50); i++) { - m = genradix_ptr(&c->stripes, h->data[i].idx); - - prt_printf(out, "%zu %u/%u+%u", h->data[i].idx, - h->data[i].blocks_nonempty, - m->nr_blocks - m->nr_redundant, - m->nr_redundant); - if (bch2_stripe_is_open(c, h->data[i].idx)) - prt_str(out, " open"); - prt_newline(out); - } - mutex_unlock(&c->ec_stripes_heap_lock); + return 0; } static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c, @@ -2477,15 +2287,12 @@ void bch2_fs_ec_exit(struct bch_fs *c) BUG_ON(!list_empty(&c->ec_stripe_new_list)); - free_heap(&c->ec_stripes_heap); - genradix_free(&c->stripes); bioset_exit(&c->ec_bioset); } void bch2_fs_ec_init_early(struct bch_fs *c) { spin_lock_init(&c->ec_stripes_new_lock); - mutex_init(&c->ec_stripes_heap_lock); INIT_LIST_HEAD(&c->ec_stripe_head_list); mutex_init(&c->ec_stripe_head_lock); @@ -2503,3 +2310,40 @@ int bch2_fs_ec_init(struct bch_fs *c) return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), BIOSET_NEED_BVECS); }
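Both the delete worker earlier and this reuse path now scan the persistent BCH_LRU_STRIPE_FRAGMENTATION keyspace rather than a heap: LRU position 1 (STRIPE_LRU_POS_EMPTY) collects stripes whose data blocks are all empty, ready for deletion, while positions above 1 hold partially empty stripes, ordered so that emptier stripes are reused first. A compilable userspace sketch of the position function added to ec.h further down (bch_stripe is stubbed out, and the 48-bit LRU_TIME_MAX width is an assumption here):

	#include <stdio.h>
	#include <stdint.h>

	#define LRU_TIME_MAX ((1ULL << 48) - 1)	/* assumed width, for illustration */

	/* mirrors the new stripe_lru_pos(): 0 = keep off the LRU, 1 = deletable */
	static uint64_t stripe_lru_pos(unsigned nr_data, const unsigned *block_sectors)
	{
		unsigned blocks_empty = 0;

		for (unsigned i = 0; i < nr_data; i++)
			blocks_empty += !block_sectors[i];

		if (blocks_empty == nr_data)
			return 1;			/* STRIPE_LRU_POS_EMPTY: the delete worker's range */
		if (!blocks_empty)
			return 0;			/* fully utilized: not a reuse candidate */
		return LRU_TIME_MAX - blocks_empty;	/* inverted: emptier stripes sort first */
	}

	int main(void)
	{
		unsigned empty[4] = { 0 }, partial[4] = { 128, 0, 0, 256 }, full[4] = { 8, 8, 8, 8 };

		printf("%llu %llu %llu\n",
		       (unsigned long long) stripe_lru_pos(4, empty),	/* 1 */
		       (unsigned long long) stripe_lru_pos(4, partial),	/* LRU_TIME_MAX - 2 */
		       (unsigned long long) stripe_lru_pos(4, full));	/* 0 */
		return 0;
	}

+ +static int bch2_check_stripe_to_lru_ref(struct btree_trans *trans, + struct bkey_s_c k, + struct bkey_buf *last_flushed) +{ + if (k.k->type != KEY_TYPE_stripe) + return 0; + + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + + u64 lru_idx = stripe_lru_pos(s.v); + if (lru_idx) { + int ret = bch2_lru_check_set(trans, 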
BCH_LRU_STRIPE_FRAGMENTATION, + k.k->p.offset, lru_idx, k, last_flushed); + if (ret) + return ret; + } + return 0; +} + +int bch2_check_stripe_to_lru_refs(struct bch_fs *c) +{ + struct bkey_buf last_flushed; + + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_stripes, + POS_MIN, BTREE_ITER_prefetch, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_check_stripe_to_lru_ref(trans, k, &last_flushed))); + + bch2_bkey_buf_exit(&last_flushed, c); + bch_err_fn(c, ret); + return ret; +} diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 583ca6a226da..62d27e04d763 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -92,6 +92,29 @@ static inline void stripe_csum_set(struct bch_stripe *s, memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]); } +#define STRIPE_LRU_POS_EMPTY 1 + +static inline u64 stripe_lru_pos(const struct bch_stripe *s) +{ + if (!s) + return 0; + + unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_empty = 0; + + for (unsigned i = 0; i < nr_data; i++) + blocks_empty += !stripe_blockcount_get(s, i); + + /* Will be picked up by the stripe_delete worker */ + if (blocks_empty == nr_data) + return STRIPE_LRU_POS_EMPTY; + + if (!blocks_empty) + return 0; + + /* invert: more blocks empty = reuse first */ + return LRU_TIME_MAX - blocks_empty; +} + static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr, const struct bch_extent_ptr *data_ptr, unsigned sectors) @@ -132,6 +155,20 @@ static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m, m->sectors); } +static inline void gc_stripe_unlock(struct gc_stripe *s) +{ + BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte); + + clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &s->lock); + wake_up_bit((void *) &s->lock, BUCKET_LOCK_BITNR); +} + +static inline void gc_stripe_lock(struct gc_stripe *s) +{ + wait_on_bit_lock((void *) &s->lock, BUCKET_LOCK_BITNR, + TASK_UNINTERRUPTIBLE); +} + struct bch_read_bio; struct ec_stripe_buf { @@ -212,7 +249,7 @@ int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); -void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); +void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int); int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); @@ -221,10 +258,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *, unsigned, unsigned, unsigned, enum bch_watermark, struct closure *); -void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); -void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); -void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); - void bch2_do_stripe_deletes(struct bch_fs *); void bch2_ec_do_stripe_creates(struct bch_fs *); void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *); @@ -261,11 +294,12 @@ void bch2_fs_ec_flush(struct bch_fs *); int bch2_stripes_read(struct bch_fs *); -void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); void bch2_fs_ec_exit(struct bch_fs *); void bch2_fs_ec_init_early(struct bch_fs *); int bch2_fs_ec_init(struct bch_fs *); + +int bch2_check_stripe_to_lru_refs(struct bch_fs *); + #endif /* _BCACHEFS_EC_H */
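gc_stripe_lock()/gc_stripe_unlock() just above pack a sleeping bit-lock into the new u8 lock field that ec_types.h adds to struct gc_stripe (next diff): wait_on_bit_lock() acquires the bit, and the unlock side must clear it with release semantics before waking waiters. The BUILD_BUG_ON() appears to assert that bit BUCKET_LOCK_BITNR of an unsigned long lands in its first byte, since the bit helpers are being pointed at a lone u8. A rough userspace analogue using GCC/Clang atomic builtins, with the same acquire/release shape but spinning instead of sleeping (not the kernel API):

	struct obj {
		unsigned char lock;	/* bit 0 doubles as the lock, like gc_stripe->lock */
		int payload;
	};

	static void obj_lock(struct obj *o)
	{
		/* acquire: payload accesses can't be reordered before taking the lock */
		while (__atomic_test_and_set(&o->lock, __ATOMIC_ACQUIRE))
			;	/* wait_on_bit_lock() sleeps here instead of spinning */
	}

	static void obj_unlock(struct obj *o)
	{
		/* release: payload writes become visible before the lock clears;
		 * the kernel version additionally calls wake_up_bit() to kick sleepers */
		__atomic_clear(&o->lock, __ATOMIC_RELEASE);
	}

	int main(void)
	{
		struct obj o = { 0, 0 };

		obj_lock(&o);
		o.payload = 42;
		obj_unlock(&o);
		return o.payload == 42 ? 0 : 1;
	}

diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h 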
index 8d1e70e830ac..06144bfd9c19 100644 --- a/fs/bcachefs/ec_types.h +++ b/fs/bcachefs/ec_types.h @@ -20,23 +20,15 @@ struct stripe { }; struct gc_stripe { + u8 lock; + unsigned alive:1; /* does a corresponding key exist in stripes btree? */ u16 sectors; - u8 nr_blocks; u8 nr_redundant; - - unsigned alive:1; /* does a corresponding key exist in stripes btree? */ u16 block_sectors[BCH_BKEY_PTRS_MAX]; struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX]; struct bch_replicas_padded r; }; -struct ec_stripe_heap_entry { - size_t idx; - unsigned blocks_nonempty; -}; - -typedef DEFINE_MIN_HEAP(struct ec_stripe_heap_entry, ec_stripes_heap) ec_stripes_heap; - #endif /* _BCACHEFS_EC_TYPES_H */ diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 4590cd0c7c90..c8696f01eb14 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -5,6 +5,8 @@ #define BCH_ERRCODES() \ x(ERANGE, ERANGE_option_too_small) \ x(ERANGE, ERANGE_option_too_big) \ + x(EINVAL, injected) \ + x(BCH_ERR_injected, injected_fs_start) \ x(EINVAL, mount_option) \ x(BCH_ERR_mount_option, option_name) \ x(BCH_ERR_mount_option, option_value) \ @@ -116,9 +118,11 @@ x(ENOENT, ENOENT_snapshot_tree) \ x(ENOENT, ENOENT_dirent_doesnt_match_inode) \ x(ENOENT, ENOENT_dev_not_found) \ + x(ENOENT, ENOENT_dev_bucket_not_found) \ x(ENOENT, ENOENT_dev_idx_not_found) \ x(ENOENT, ENOENT_inode_no_backpointer) \ x(ENOENT, ENOENT_no_snapshot_tree_subvol) \ + x(ENOENT, btree_node_dying) \ x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ x(EEXIST, EEXIST_str_hash_set) \ @@ -180,6 +184,12 @@ x(EINVAL, not_in_recovery) \ x(EINVAL, cannot_rewind_recovery) \ x(0, data_update_done) \ + x(BCH_ERR_data_update_done, data_update_done_would_block) \ + x(BCH_ERR_data_update_done, data_update_done_unwritten) \ + x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \ + x(BCH_ERR_data_update_done, data_update_done_no_snapshot) \ + x(BCH_ERR_data_update_done, data_update_done_no_dev_refs) \ + x(BCH_ERR_data_update_done, data_update_done_no_rw_devs) \ x(EINVAL, device_state_not_allowed) \ x(EINVAL, member_info_missing) \ x(EINVAL, mismatched_block_size) \ @@ -200,6 +210,8 @@ x(EINVAL, no_resize_with_buckets_nouse) \ x(EINVAL, inode_unpack_error) \ x(EINVAL, varint_decode_error) \ + x(EINVAL, erasure_coding_found_btree_node) \ + x(EOPNOTSUPP, may_not_use_incompat_feature) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ @@ -210,10 +222,18 @@ x(EROFS, insufficient_devices) \ x(0, operation_blocked) \ x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \ - x(BCH_ERR_operation_blocked, journal_res_get_blocked) \ - x(BCH_ERR_operation_blocked, journal_preres_get_blocked) \ - x(BCH_ERR_operation_blocked, bucket_alloc_blocked) \ - x(BCH_ERR_operation_blocked, stripe_alloc_blocked) \ + x(BCH_ERR_operation_blocked, journal_res_blocked) \ + x(BCH_ERR_journal_res_blocked, journal_blocked) \ + x(BCH_ERR_journal_res_blocked, journal_max_in_flight) \ + x(BCH_ERR_journal_res_blocked, journal_max_open) \ + x(BCH_ERR_journal_res_blocked, journal_full) \ + x(BCH_ERR_journal_res_blocked, journal_pin_full) \ + x(BCH_ERR_journal_res_blocked, journal_buf_enomem) \ + x(BCH_ERR_journal_res_blocked, journal_stuck) \ + x(BCH_ERR_journal_res_blocked, journal_retry_open) \ + x(BCH_ERR_journal_res_blocked, journal_preres_get_blocked) \ + x(BCH_ERR_journal_res_blocked, bucket_alloc_blocked) \ + x(BCH_ERR_journal_res_blocked, stripe_alloc_blocked) \ x(BCH_ERR_invalid, invalid_sb) \ 
x(BCH_ERR_invalid_sb, invalid_sb_magic) \ x(BCH_ERR_invalid_sb, invalid_sb_version) \ @@ -223,6 +243,7 @@ x(BCH_ERR_invalid_sb, invalid_sb_csum) \ x(BCH_ERR_invalid_sb, invalid_sb_block_size) \ x(BCH_ERR_invalid_sb, invalid_sb_uuid) \ + x(BCH_ERR_invalid_sb, invalid_sb_offset) \ x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \ x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \ x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \ @@ -250,6 +271,7 @@ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ x(EIO, journal_shutdown) \ x(EIO, journal_flush_err) \ + x(EIO, journal_write_err) \ x(EIO, btree_node_read_err) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \ x(EIO, sb_not_downgraded) \ @@ -258,17 +280,53 @@ x(EIO, btree_node_read_validate_error) \ x(EIO, btree_need_topology_repair) \ x(EIO, bucket_ref_update) \ + x(EIO, trigger_alloc) \ x(EIO, trigger_pointer) \ x(EIO, trigger_stripe_pointer) \ x(EIO, metadata_bucket_inconsistency) \ x(EIO, mark_stripe) \ x(EIO, stripe_reconstruct) \ x(EIO, key_type_error) \ - x(EIO, no_device_to_read_from) \ + x(EIO, extent_poisened) \ x(EIO, missing_indirect_extent) \ x(EIO, invalidate_stripe_to_dev) \ x(EIO, no_encryption_key) \ x(EIO, insufficient_journal_devices) \ + x(EIO, device_offline) \ + x(EIO, EIO_fault_injected) \ + x(EIO, ec_block_read) \ + x(EIO, ec_block_write) \ + x(EIO, recompute_checksum) \ + x(EIO, decompress) \ + x(BCH_ERR_decompress, decompress_exceeded_max_encoded_extent) \ + x(BCH_ERR_decompress, decompress_lz4) \ + x(BCH_ERR_decompress, decompress_gzip) \ + x(BCH_ERR_decompress, decompress_zstd_src_len_bad) \ + x(BCH_ERR_decompress, decompress_zstd) \ + x(EIO, data_write) \ + x(BCH_ERR_data_write, data_write_io) \ + x(BCH_ERR_data_write, data_write_csum) \ + x(BCH_ERR_data_write, data_write_invalid_ptr) \ + x(BCH_ERR_data_write, data_write_misaligned) \ + x(BCH_ERR_decompress, data_read) \ + x(BCH_ERR_data_read, no_device_to_read_from) \ + x(BCH_ERR_data_read, no_devices_valid) \ + x(BCH_ERR_data_read, data_read_io_err) \ + x(BCH_ERR_data_read, data_read_csum_err) \ + x(BCH_ERR_data_read, data_read_retry) \ + x(BCH_ERR_data_read_retry, data_read_retry_avoid) \ + x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline) \ + x(BCH_ERR_data_read_retry_avoid,data_read_retry_io_err) \ + x(BCH_ERR_data_read_retry_avoid,data_read_retry_ec_reconstruct_err) \ + x(BCH_ERR_data_read_retry_avoid,data_read_retry_csum_err) \ + x(BCH_ERR_data_read_retry, data_read_retry_csum_err_maybe_userspace)\ + x(BCH_ERR_data_read, data_read_decompress_err) \ + x(BCH_ERR_data_read, data_read_decrypt_err) \ + x(BCH_ERR_data_read, data_read_ptr_stale_race) \ + x(BCH_ERR_data_read_retry, data_read_ptr_stale_retry) \ + x(BCH_ERR_data_read, data_read_no_encryption_key) \ + x(BCH_ERR_data_read, data_read_buffer_too_small) \ + x(BCH_ERR_data_read, data_read_key_overwritten) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ 
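The block above extends bcachefs's error-code x-macro table: each entry declares a parent class (either a plain errno or another BCH_ERR_* code), so the private codes form a tree and callers can match whole subtrees. For example, every data_read_retry_avoid code is also a data_read_retry and a data_read. A self-contained sketch of the pattern, with a toy table and helper names that are mine, not the kernel's actual implementation:

	#include <errno.h>
	#include <stdbool.h>
	#include <stdio.h>

	#define MY_ERRCODES()						\
		x(EIO,			 data_read)			\
		x(MYERR_data_read,	 data_read_retry)		\
		x(MYERR_data_read_retry, data_read_retry_avoid)

	enum {
		MYERR_START = 2048,	/* keep clear of real errnos */
	#define x(parent, name) MYERR_##name,
		MY_ERRCODES()
	#undef x
	};

	/* parent class of each private code, generated from the same table */
	static const int myerr_parent[] = {
	#define x(parent, name) parent,
		MY_ERRCODES()
	#undef x
	};

	static bool myerr_matches(int err, int class)
	{
		while (err > MYERR_START) {
			if (err == class)
				return true;
			err = myerr_parent[err - MYERR_START - 1];
		}
		return err == class;
	}

	int main(void)
	{
		/* walks avoid -> retry -> data_read -> EIO, prints 1 */
		printf("%d\n", myerr_matches(MYERR_data_read_retry_avoid, EIO));
		return 0;
	}

diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 038da6a61f6b..d4dfd13a8076 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -3,15 +3,24 @@ #include "btree_cache.h" #include "btree_iter.h" #include "error.h" -#include "fs-common.h" #include "journal.h" +#include "namei.h" #include "recovery_passes.h" #include "super.h" #include "thread_with_file.h" #define FSCK_ERR_RATELIMIT_NR 10 -bool bch2_inconsistent_error(struct bch_fs *c) +void bch2_log_msg_start(struct bch_fs 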
*c, struct printbuf *out) +{ + printbuf_indent_add_nextline(out, 2); + +#ifdef BCACHEFS_LOG_PREFIX + prt_printf(out, bch2_log_msg(c, "")); +#endif +} + +bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out) { set_bit(BCH_FS_error, &c->flags); @@ -21,10 +30,11 @@ bool bch2_inconsistent_error(struct bch_fs *c) case BCH_ON_ERROR_fix_safe: case BCH_ON_ERROR_ro: if (bch2_fs_emergency_read_only(c)) - bch_err(c, "inconsistency detected - emergency read only at journal seq %llu", - journal_cur_seq(&c->journal)); + prt_printf(out, "inconsistency detected - emergency read only at journal seq %llu\n", + journal_cur_seq(&c->journal)); return true; case BCH_ON_ERROR_panic: + bch2_print_string_as_lines(KERN_ERR, out->buf); panic(bch2_fmt(c, "panic after error")); return true; default: @@ -32,11 +42,63 @@ bool bch2_inconsistent_error(struct bch_fs *c) } } -int bch2_topology_error(struct bch_fs *c) +bool bch2_inconsistent_error(struct bch_fs *c) +{ + struct printbuf buf = PRINTBUF; + printbuf_indent_add_nextline(&buf, 2); + + bool ret = __bch2_inconsistent_error(c, &buf); + if (ret) + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); + return ret; +} + +__printf(3, 0) +static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *trans, + const char *fmt, va_list args) +{ + struct printbuf buf = PRINTBUF; + + bch2_log_msg_start(c, &buf); + + prt_vprintf(&buf, fmt, args); + prt_newline(&buf); + + if (trans) + bch2_trans_updates_to_text(&buf, trans); + bool ret = __bch2_inconsistent_error(c, &buf); + bch2_print_string_as_lines(KERN_ERR, buf.buf); + + printbuf_exit(&buf); + return ret; +} + +bool bch2_fs_inconsistent(struct bch_fs *c, const char *fmt, ...) { + va_list args; + va_start(args, fmt); + bool ret = bch2_fs_trans_inconsistent(c, NULL, fmt, args); + va_end(args); + return ret; +} + +bool bch2_trans_inconsistent(struct btree_trans *trans, const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + bool ret = bch2_fs_trans_inconsistent(trans->c, trans, fmt, args); + va_end(args); + return ret; +} + +int __bch2_topology_error(struct bch_fs *c, struct printbuf *out) +{ + prt_printf(out, "btree topology error: "); + set_bit(BCH_FS_topology_error, &c->flags); if (!test_bit(BCH_FS_recovery_running, &c->flags)) { - bch2_inconsistent_error(c); + __bch2_inconsistent_error(c, out); return -BCH_ERR_btree_need_topology_repair; } else { return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: @@ -44,6 +106,24 @@ int bch2_topology_error(struct bch_fs *c) } } +int bch2_fs_topology_error(struct bch_fs *c, const char *fmt, ...) +{ + struct printbuf buf = PRINTBUF; + + bch2_log_msg_start(c, &buf); + + va_list args; + va_start(args, fmt); + prt_vprintf(&buf, fmt, args); + va_end(args); + + int ret = __bch2_topology_error(c, &buf); + bch2_print_string_as_lines(KERN_ERR, buf.buf); + + printbuf_exit(&buf); + return ret; +} + void bch2_fatal_error(struct bch_fs *c) { if (bch2_fs_emergency_read_only(c)) @@ -54,25 +134,41 @@ void bch2_io_error_work(struct work_struct *work) { struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); struct bch_fs *c = ca->fs; - bool dev; + + /* XXX: if it's reads or checksums that are failing, set it to failed */ down_write(&c->state_lock); - dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro, - BCH_FORCE_IF_DEGRADED); - if (dev - ? 
__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, - BCH_FORCE_IF_DEGRADED) - : bch2_fs_emergency_read_only(c)) + unsigned long write_errors_start = READ_ONCE(ca->write_errors_start); + + if (write_errors_start && + time_after(jiffies, + write_errors_start + c->opts.write_error_timeout * HZ)) { + if (ca->mi.state >= BCH_MEMBER_STATE_ro) + goto out; + + bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, + BCH_FORCE_IF_DEGRADED); + bch_err(ca, - "too many IO errors, setting %s RO", + "writes erroring for %u seconds, setting %s ro", + c->opts.write_error_timeout, dev ? "device" : "filesystem"); + if (!dev) + bch2_fs_emergency_read_only(c); + + } +out: up_write(&c->state_lock); } void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type) { atomic64_inc(&ca->errors[type]); - //queue_work(system_long_wq, &ca->io_error_work); + + if (type == BCH_MEMBER_ERROR_write && !ca->write_errors_start) + ca->write_errors_start = jiffies; + + queue_work(system_long_wq, &ca->io_error_work); } enum ask_yn { @@ -168,7 +264,8 @@ static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans) #endif -static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) +static struct fsck_err_state *fsck_err_get(struct bch_fs *c, + enum bch_sb_error_id id) { struct fsck_err_state *s; @@ -176,7 +273,7 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) return NULL; list_for_each_entry(s, &c->fsck_error_msgs, list) - if (s->fmt == fmt) { + if (s->id == id) { /* * move it to the head of the list: repeated fsck errors * are common @@ -194,7 +291,7 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) } INIT_LIST_HEAD(&s->list); - s->fmt = fmt; + s->id = id; list_add(&s->list, &c->fsck_error_msgs); return s; } @@ -244,15 +341,59 @@ static int do_fsck_ask_yn(struct bch_fs *c, return ask; } +static struct fsck_err_state *count_fsck_err_locked(struct bch_fs *c, + enum bch_sb_error_id id, const char *msg, + bool *repeat, bool *print, bool *suppress) +{ + bch2_sb_error_count(c, id); + + struct fsck_err_state *s = fsck_err_get(c, id); + if (s) { + /* + * We may be called multiple times for the same error on + * transaction restart - this memoizes instead of asking the user + * multiple times for the same error: + */ + if (s->last_msg && !strcmp(msg, s->last_msg)) { + *repeat = true; + *print = false; + return s; + } + + kfree(s->last_msg); + s->last_msg = kstrdup(msg, GFP_KERNEL); + + if (c->opts.ratelimit_errors && + s->nr >= FSCK_ERR_RATELIMIT_NR) { + if (s->nr == FSCK_ERR_RATELIMIT_NR) + *suppress = true; + else + *print = false; + } + + s->nr++; + } + return s; +} + +void __bch2_count_fsck_err(struct bch_fs *c, + enum bch_sb_error_id id, const char *msg, + bool *repeat, bool *print, bool *suppress) +{ + bch2_sb_error_count(c, id); + + mutex_lock(&c->fsck_error_msgs_lock); + count_fsck_err_locked(c, id, msg, repeat, print, suppress); + mutex_unlock(&c->fsck_error_msgs_lock); +} + int __bch2_fsck_err(struct bch_fs *c, struct btree_trans *trans, enum bch_fsck_flags flags, enum bch_sb_error_id err, const char *fmt, ...) { - struct fsck_err_state *s = NULL; va_list args; - bool print = true, suppressing = false, inconsistent = false, exiting = false; struct printbuf buf = PRINTBUF, *out = &buf; int ret = -BCH_ERR_fsck_ignore; const char *action_orig = "fix?", *action = action_orig; @@ -287,7 +428,12 @@ int __bch2_fsck_err(struct bch_fs *c, ? 
-BCH_ERR_fsck_fix : -BCH_ERR_fsck_ignore; - bch2_sb_error_count(c, err); + printbuf_indent_add_nextline(out, 2); + +#ifdef BCACHEFS_LOG_PREFIX + if (strncmp(fmt, "bcachefs", 8)) + prt_printf(out, bch2_log_msg(c, "")); +#endif va_start(args, fmt); prt_vprintf(out, fmt, args); @@ -307,42 +453,15 @@ int __bch2_fsck_err(struct bch_fs *c, } mutex_lock(&c->fsck_error_msgs_lock); - s = fsck_err_get(c, fmt); - if (s) { - /* - * We may be called multiple times for the same error on - * transaction restart - this memoizes instead of asking the user - * multiple times for the same error: - */ - if (s->last_msg && !strcmp(buf.buf, s->last_msg)) { - ret = s->ret; - goto err_unlock; - } - - kfree(s->last_msg); - s->last_msg = kstrdup(buf.buf, GFP_KERNEL); - if (!s->last_msg) { - ret = -ENOMEM; - goto err_unlock; - } - - if (c->opts.ratelimit_errors && - !(flags & FSCK_NO_RATELIMIT) && - s->nr >= FSCK_ERR_RATELIMIT_NR) { - if (s->nr == FSCK_ERR_RATELIMIT_NR) - suppressing = true; - else - print = false; - } - - s->nr++; + bool repeat = false, print = true, suppress = false; + bool inconsistent = false, exiting = false; + struct fsck_err_state *s = + count_fsck_err_locked(c, err, buf.buf, &repeat, &print, &suppress); + if (repeat) { + ret = s->ret; + goto err_unlock; } -#ifdef BCACHEFS_LOG_PREFIX - if (!strncmp(fmt, "bcachefs:", 9)) - prt_printf(out, bch2_log_msg(c, "")); -#endif - if ((flags & FSCK_AUTOFIX) && (c->opts.errors == BCH_ON_ERROR_continue || c->opts.errors == BCH_ON_ERROR_fix_safe)) { @@ -361,6 +480,7 @@ int __bch2_fsck_err(struct bch_fs *c, !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { prt_str(out, ", shutting down"); inconsistent = true; + print = true; ret = -BCH_ERR_fsck_errors_not_fixed; } else if (flags & FSCK_CAN_FIX) { prt_str(out, ", "); @@ -419,24 +539,30 @@ int __bch2_fsck_err(struct bch_fs *c, print = true; } print: + prt_newline(out); + + if (inconsistent) + __bch2_inconsistent_error(c, out); + else if (exiting) + prt_printf(out, "Unable to continue, halting\n"); + else if (suppress) + prt_printf(out, "Ratelimiting new instances of previous error\n"); + if (print) { + /* possibly strip an empty line, from printbuf_indent_add */ + while (out->pos && out->buf[out->pos - 1] == ' ') + --out->pos; + printbuf_nul_terminate(out); + if (bch2_fs_stdio_redirect(c)) - bch2_print(c, "%s\n", out->buf); + bch2_print(c, "%s", out->buf); else bch2_print_string_as_lines(KERN_ERR, out->buf); } - if (exiting) - bch_err(c, "Unable to continue, halting"); - else if (suppressing) - bch_err(c, "Ratelimiting new instances of previous error"); - if (s) s->ret = ret; - if (inconsistent) - bch2_inconsistent_error(c); - /* * We don't yet track whether the filesystem currently has errors, for * log_fsck_err()s: that would require us to track for every error type @@ -498,16 +624,14 @@ int __bch2_bkey_fsck_err(struct bch_fs *c, prt_printf(&buf, " level=%u: ", from.level); bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\n "); + prt_newline(&buf); va_list args; va_start(args, fmt); prt_vprintf(&buf, fmt, args); va_end(args); - prt_str(&buf, ": delete?"); - - int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s", buf.buf); + int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s, delete?", buf.buf); printbuf_exit(&buf); return ret; } @@ -520,7 +644,7 @@ void bch2_flush_fsck_errs(struct bch_fs *c) list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) { if (s->ratelimited && s->last_msg) - bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg); + bch_err(c, "Saw %llu errors like:\n %s", s->nr, 
s->last_msg); list_del(&s->list); kfree(s->last_msg); @@ -530,35 +654,59 @@ void bch2_flush_fsck_errs(struct bch_fs *c) mutex_unlock(&c->fsck_error_msgs_lock); } -int bch2_inum_err_msg_trans(struct btree_trans *trans, struct printbuf *out, subvol_inum inum) +int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, + subvol_inum inum, u64 offset) { u32 restart_count = trans->restart_count; int ret = 0; - /* XXX: we don't yet attempt to print paths when we don't know the subvol */ - if (inum.subvol) - ret = lockrestart_do(trans, bch2_inum_to_path(trans, inum, out)); + if (inum.subvol) { + ret = bch2_inum_to_path(trans, inum, out); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + } if (!inum.subvol || ret) prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum); + prt_printf(out, " offset %llu: ", offset); return trans_was_restarted(trans, restart_count); } -int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, - subvol_inum inum, u64 offset) +void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, + subvol_inum inum, u64 offset) { - int ret = bch2_inum_err_msg_trans(trans, out, inum); - prt_printf(out, " offset %llu: ", offset); - return ret; + bch2_trans_do(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); } -void bch2_inum_err_msg(struct bch_fs *c, struct printbuf *out, subvol_inum inum) +int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, + struct bpos pos) { - bch2_trans_run(c, bch2_inum_err_msg_trans(trans, out, inum)); + struct bch_fs *c = trans->c; + int ret = 0; + + if (!bch2_snapshot_is_leaf(c, pos.snapshot)) + prt_str(out, "(multiple snapshots) "); + + subvol_inum inum = { + .subvol = bch2_snapshot_tree_oldest_subvol(c, pos.snapshot), + .inum = pos.inode, + }; + + if (inum.subvol) { + ret = bch2_inum_to_path(trans, inum, out); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + } + + if (!inum.subvol || ret) + prt_printf(out, "inum %llu:%u", pos.inode, pos.snapshot); + + prt_printf(out, " offset %llu: ", pos.offset << 8); + return 0; } -void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, - subvol_inum inum, u64 offset) +void bch2_inum_snap_offset_err_msg(struct bch_fs *c, struct printbuf *out, + struct bpos pos) { - bch2_trans_run(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); + bch2_trans_do(c, bch2_inum_snap_offset_err_msg_trans(trans, out, pos)); } diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 7acf2a27ca28..d0d024dc714b 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -18,6 +18,8 @@ struct work_struct; /* Error messages: */ +void bch2_log_msg_start(struct bch_fs *, struct printbuf *); + /* * Inconsistency errors: The on disk data is inconsistent. If these occur during * initial recovery, they don't indicate a bug in the running code - we walk all @@ -29,21 +31,10 @@ struct work_struct; * BCH_ON_ERROR_CONTINUE mode */ +bool __bch2_inconsistent_error(struct bch_fs *, struct printbuf *); bool bch2_inconsistent_error(struct bch_fs *); - -int bch2_topology_error(struct bch_fs *); - -#define bch2_fs_topology_error(c, ...) \ -({ \ - bch_err(c, "btree topology error: " __VA_ARGS__); \ - bch2_topology_error(c); \ -}) - -#define bch2_fs_inconsistent(c, ...) \ -({ \ - bch_err(c, __VA_ARGS__); \ - bch2_inconsistent_error(c); \ -}) +__printf(2, 3) +bool bch2_fs_inconsistent(struct bch_fs *, const char *, ...); #define bch2_fs_inconsistent_on(cond, ...) 
\ ({ \ @@ -53,26 +44,21 @@ int bch2_topology_error(struct bch_fs *); _ret; \ }) -/* - * When a transaction update discovers or is causing a fs inconsistency, it's - * helpful to also dump the pending updates: - */ -#define bch2_trans_inconsistent(trans, ...) \ -({ \ - bch_err(trans->c, __VA_ARGS__); \ - bch2_dump_trans_updates(trans); \ - bch2_inconsistent_error(trans->c); \ -}) +__printf(2, 3) +bool bch2_trans_inconsistent(struct btree_trans *, const char *, ...); -#define bch2_trans_inconsistent_on(cond, trans, ...) \ +#define bch2_trans_inconsistent_on(cond, ...) \ ({ \ bool _ret = unlikely(!!(cond)); \ - \ if (_ret) \ - bch2_trans_inconsistent(trans, __VA_ARGS__); \ + bch2_trans_inconsistent(__VA_ARGS__); \ _ret; \ }) +int __bch2_topology_error(struct bch_fs *, struct printbuf *); +__printf(2, 3) +int bch2_fs_topology_error(struct bch_fs *, const char *, ...); + /* * Fsck errors: inconsistency errors we detect at mount time, and should ideally * be able to repair: @@ -80,7 +66,7 @@ int bch2_topology_error(struct bch_fs *); struct fsck_err_state { struct list_head list; - const char *fmt; + enum bch_sb_error_id id; u64 nr; bool ratelimited; int ret; @@ -90,6 +76,12 @@ struct fsck_err_state { #define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err) +void __bch2_count_fsck_err(struct bch_fs *, + enum bch_sb_error_id, const char *, + bool *, bool *, bool *); +#define bch2_count_fsck_err(_c, _err, ...) \ + __bch2_count_fsck_err(_c, BCH_FSCK_ERR_##_err, __VA_ARGS__) + __printf(5, 6) __cold int __bch2_fsck_err(struct bch_fs *, struct btree_trans *, enum bch_fsck_flags, @@ -216,32 +208,43 @@ void bch2_io_error_work(struct work_struct *); /* Does the error handling without logging a message */ void bch2_io_error(struct bch_dev *, enum bch_member_error_type); -#define bch2_dev_io_err_on(cond, ca, _type, ...) \ -({ \ - bool _ret = (cond); \ - \ - if (_ret) { \ - bch_err_dev_ratelimited(ca, __VA_ARGS__); \ - bch2_io_error(ca, _type); \ - } \ - _ret; \ -}) +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT +void bch2_latency_acct(struct bch_dev *, u64, int); +#else +static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} +#endif + +static inline void bch2_account_io_success_fail(struct bch_dev *ca, + enum bch_member_error_type type, + bool success) +{ + if (likely(success)) { + if (type == BCH_MEMBER_ERROR_write && + ca->write_errors_start) + ca->write_errors_start = 0; + } else { + bch2_io_error(ca, type); + } +} + +static inline void bch2_account_io_completion(struct bch_dev *ca, + enum bch_member_error_type type, + u64 submit_time, bool success) +{ + if (unlikely(!ca)) + return; + + if (type != BCH_MEMBER_ERROR_checksum) + bch2_latency_acct(ca, submit_time, type); + + bch2_account_io_success_fail(ca, type, success); +} -#define bch2_dev_inum_io_err_on(cond, ca, _type, ...) 
\ -({ \ - bool _ret = (cond); \ - \ - if (_ret) { \ - bch_err_inum_offset_ratelimited(ca, __VA_ARGS__); \ - bch2_io_error(ca, _type); \ - } \ - _ret; \ -}) - -int bch2_inum_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum); int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64); -void bch2_inum_err_msg(struct bch_fs *, struct printbuf *, subvol_inum); void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64); +int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *, struct printbuf *, struct bpos); +void bch2_inum_snap_offset_err_msg(struct bch_fs *, struct printbuf *, struct bpos); + #endif /* _BCACHEFS_ERROR_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 2d8042f853dc..ae7c7a177e10 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -28,6 +28,13 @@ #include "trace.h" #include "util.h" +static const char * const bch2_extent_flags_strs[] = { +#define x(n, v) [BCH_EXTENT_FLAG_##n] = #n, + BCH_EXTENT_FLAGS() +#undef x + NULL, +}; + static unsigned bch2_crc_field_size_max[] = { [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, @@ -51,7 +58,8 @@ struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f, } void bch2_mark_io_failure(struct bch_io_failures *failed, - struct extent_ptr_decoded *p) + struct extent_ptr_decoded *p, + bool csum_error) { struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev); @@ -59,53 +67,57 @@ void bch2_mark_io_failure(struct bch_io_failures *failed, BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); f = &failed->devs[failed->nr++]; - f->dev = p->ptr.dev; - f->idx = p->idx; - f->nr_failed = 1; - f->nr_retries = 0; - } else if (p->idx != f->idx) { - f->idx = p->idx; - f->nr_failed = 1; - f->nr_retries = 0; - } else { - f->nr_failed++; + memset(f, 0, sizeof(*f)); + f->dev = p->ptr.dev; } + + if (p->do_ec_reconstruct) + f->failed_ec = true; + else if (!csum_error) + f->failed_io = true; + else + f->failed_csum_nr++; } -static inline u64 dev_latency(struct bch_fs *c, unsigned dev) +static inline u64 dev_latency(struct bch_dev *ca) { - struct bch_dev *ca = bch2_dev_rcu(c, dev); return ca ? 
atomic64_read(&ca->cur_latency[READ]) : S64_MAX; } +static inline int dev_failed(struct bch_dev *ca) +{ + return !ca || ca->mi.state == BCH_MEMBER_STATE_failed; +} + /* * returns true if p1 is better than p2: */ static inline bool ptr_better(struct bch_fs *c, const struct extent_ptr_decoded p1, - const struct extent_ptr_decoded p2) + u64 p1_latency, + struct bch_dev *ca1, + const struct extent_ptr_decoded p2, + u64 p2_latency) { - if (likely(!p1.idx && !p2.idx)) { - u64 l1 = dev_latency(c, p1.ptr.dev); - u64 l2 = dev_latency(c, p2.ptr.dev); + struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev); - /* - * Square the latencies, to bias more in favor of the faster - * device - we never want to stop issuing reads to the slower - * device altogether, so that we can update our latency numbers: - */ - l1 *= l1; - l2 *= l2; + int failed_delta = dev_failed(ca1) - dev_failed(ca2); + if (unlikely(failed_delta)) + return failed_delta < 0; - /* Pick at random, biased in favor of the faster device: */ + if (unlikely(bch2_force_reconstruct_read)) + return p1.do_ec_reconstruct > p2.do_ec_reconstruct; - return bch2_get_random_u64_below(l1 + l2) > l1; - } + if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct)) + return p1.do_ec_reconstruct < p2.do_ec_reconstruct; + + int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr; + if (unlikely(crc_retry_delta)) + return crc_retry_delta < 0; - if (bch2_force_reconstruct_read) - return p1.idx > p2.idx; + /* Pick at random, biased in favor of the faster device: */ - return p1.idx < p2.idx; + return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency; }
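ptr_better() keeps the biased coin flip as its final tiebreak, and the caller below now squares each device's latency before passing it in: bch2_get_random_u64_below(l1 + l2) > l1 selects the first pointer with probability l2 / (l1 + l2), so squaring sharpens the bias toward the faster device while still sending occasional reads to the slower one to keep its latency estimate fresh. A quick userspace simulation of that weighting, with rand() standing in for bch2_get_random_u64_below() and arbitrary latency units:

	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		/* squared latencies for devices at 10 and 30 (arbitrary units) */
		unsigned long long l1 = 10 * 10, l2 = 30 * 30;
		unsigned long long fast = 0, trials = 1000000;

		for (unsigned long long i = 0; i < trials; i++)
			fast += (unsigned long long) rand() % (l1 + l2) > l1;

		/* ~90% with squared latencies; unsquared would give only ~75% */
		printf("fast device picked %.1f%% of the time\n",
		       100.0 * fast / trials);
		return 0;
	}

 /* @@ -115,64 +127,111 @@ */ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, struct bch_io_failures *failed, - struct extent_ptr_decoded *pick) + struct extent_ptr_decoded *pick, + int dev) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - struct bch_dev_io_failures *f; - int ret = 0; + bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false; + bool have_dirty_ptrs = false, have_pick = false; if (k.k->type == KEY_TYPE_error) return -BCH_ERR_key_type_error; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) + return -BCH_ERR_extent_poisened; + rcu_read_lock(); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + u64 pick_latency; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + have_dirty_ptrs |= !p.ptr.cached; + /* * Unwritten extent: no need to actually read, treat it as a * hole and return 0s: */ if (p.ptr.unwritten) { - ret = 0; - break; + rcu_read_unlock(); + return 0; } - /* - * If there are any dirty pointers it's an error if we can't - * read: - */ - if (!ret && !p.ptr.cached) - ret = -BCH_ERR_no_device_to_read_from; + /* Are we being asked to read from a specific device? */ + if (dev >= 0 && p.ptr.dev != dev) + continue; struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr))) continue; - f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL; - if (f) - p.idx = f->nr_failed < f->nr_retries - ? f->idx - : f->idx + 1; + struct bch_dev_io_failures *f = + unlikely(failed) ? 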
bch2_dev_io_failures(failed, p.ptr.dev) : NULL; + if (unlikely(f)) { + p.crc_retry_nr = f->failed_csum_nr; + p.has_ec &= ~f->failed_ec; - if (!p.idx && (!ca || !bch2_dev_is_readable(ca))) - p.idx++; + if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) { + have_io_errors |= f->failed_io; + have_io_errors |= f->failed_ec; + } + have_csum_errors |= !!f->failed_csum_nr; - if (!p.idx && p.has_ec && bch2_force_reconstruct_read) - p.idx++; + if (p.has_ec && (f->failed_io || f->failed_csum_nr)) + p.do_ec_reconstruct = true; + else if (f->failed_io || + f->failed_csum_nr > c->opts.checksum_err_retry_nr) + continue; + } - if (p.idx > (unsigned) p.has_ec) - continue; + have_missing_devs |= ca && !bch2_dev_is_online(ca); - if (ret > 0 && !ptr_better(c, p, *pick)) - continue; + if (!ca || !bch2_dev_is_online(ca)) { + if (!p.has_ec) + continue; + p.do_ec_reconstruct = true; + } + + if (bch2_force_reconstruct_read && p.has_ec) + p.do_ec_reconstruct = true; - *pick = p; - ret = 1; + u64 p_latency = dev_latency(ca); + /* + * Square the latencies, to bias more in favor of the faster + * device - we never want to stop issuing reads to the slower + * device altogether, so that we can update our latency numbers: + */ + p_latency *= p_latency; + + if (!have_pick || + ptr_better(c, + p, p_latency, ca, + *pick, pick_latency)) { + *pick = p; + pick_latency = p_latency; + have_pick = true; + } } rcu_read_unlock(); - return ret; + if (have_pick) + return 1; + if (!have_dirty_ptrs) + return 0; + if (have_missing_devs) + return -BCH_ERR_no_device_to_read_from; + if (have_csum_errors) + return -BCH_ERR_data_read_csum_err; + if (have_io_errors) + return -BCH_ERR_data_read_io_err; + + /* + * If we get here, we have pointers (bkey_ptrs_validate() ensures that), + * but they don't point to valid devices: + */ + return -BCH_ERR_no_devices_valid; } /* KEY_TYPE_btree_ptr: */ @@ -536,29 +595,35 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst, struct bch_extent_crc_unpacked src, enum bch_extent_entry_type type) { -#define set_common_fields(_dst, _src) \ - _dst.type = 1 << type; \ - _dst.csum_type = _src.csum_type, \ - _dst.compression_type = _src.compression_type, \ - _dst._compressed_size = _src.compressed_size - 1, \ - _dst._uncompressed_size = _src.uncompressed_size - 1, \ - _dst.offset = _src.offset +#define common_fields(_src) \ + .type = BIT(type), \ + .csum_type = _src.csum_type, \ + .compression_type = _src.compression_type, \ + ._compressed_size = _src.compressed_size - 1, \ + ._uncompressed_size = _src.uncompressed_size - 1, \ + .offset = _src.offset switch (type) { case BCH_EXTENT_ENTRY_crc32: - set_common_fields(dst->crc32, src); - dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo); + dst->crc32 = (struct bch_extent_crc32) { + common_fields(src), + .csum = (u32 __force) *((__le32 *) &src.csum.lo), + }; break; case BCH_EXTENT_ENTRY_crc64: - set_common_fields(dst->crc64, src); - dst->crc64.nonce = src.nonce; - dst->crc64.csum_lo = (u64 __force) src.csum.lo; - dst->crc64.csum_hi = (u64 __force) *((__le16 *) &src.csum.hi); + dst->crc64 = (struct bch_extent_crc64) { + common_fields(src), + .nonce = src.nonce, + .csum_lo = (u64 __force) src.csum.lo, + .csum_hi = (u64 __force) *((__le16 *) &src.csum.hi), + }; break; case BCH_EXTENT_ENTRY_crc128: - set_common_fields(dst->crc128, src); - dst->crc128.nonce = src.nonce; - dst->crc128.csum = src.csum; + dst->crc128 = (struct bch_extent_crc128) { + common_fields(src), + .nonce = src.nonce, + .csum = src.csum, + }; break; default: BUG(); @@ -997,7 +1062,7 @@ 
static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts, struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); - return ca && bch2_dev_is_readable(ca) && !dev_ptr_stale_rcu(ca, ptr); + return ca && bch2_dev_is_healthy(ca) && !dev_ptr_stale_rcu(ca, ptr); } void bch2_extent_ptr_set_cached(struct bch_fs *c, @@ -1220,6 +1285,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, bch2_extent_rebalance_to_text(out, c, &entry->rebalance); break; + case BCH_EXTENT_ENTRY_flags: + prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags); + break; + default: prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); return; @@ -1381,6 +1450,11 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, #endif break; } + case BCH_EXTENT_ENTRY_flags: + bkey_fsck_err_on(entry != ptrs.start, + c, extent_flags_not_at_start, + "extent flags entry not at start"); + break; } } @@ -1447,6 +1521,28 @@ void bch2_ptr_swab(struct bkey_s k) } } +int bch2_bkey_extent_flags_set(struct bch_fs *c, struct bkey_i *k, u64 flags) +{ + int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags); + if (ret) + return ret; + + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + + if (ptrs.start != ptrs.end && + extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) { + ptrs.start->flags.flags = flags; + } else { + struct bch_extent_flags f = { + .type = BIT(BCH_EXTENT_ENTRY_flags), + .flags = flags, + }; + __extent_entry_insert(k, ptrs.start, (union bch_extent_entry *) &f); + } + + return 0; +} + /* Generic extent code: */ int bch2_cut_front_s(struct bpos where, struct bkey_s k) @@ -1492,8 +1588,8 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k) entry->crc128.offset += sub; break; case BCH_EXTENT_ENTRY_stripe_ptr: - break; case BCH_EXTENT_ENTRY_rebalance: + case BCH_EXTENT_ENTRY_flags: break; } diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 204d765dd74c..e78a39e7e18f 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -320,8 +320,9 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) ({ \ __label__ out; \ \ - (_ptr).idx = 0; \ - (_ptr).has_ec = false; \ + (_ptr).has_ec = false; \ + (_ptr).do_ec_reconstruct = false; \ + (_ptr).crc_retry_nr = 0; \ \ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ switch (__extent_entry_type(_entry)) { \ @@ -401,10 +402,10 @@ out: \ struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *, unsigned); void bch2_mark_io_failure(struct bch_io_failures *, - struct extent_ptr_decoded *); + struct extent_ptr_decoded *, bool); int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, struct bch_io_failures *, - struct extent_ptr_decoded *); + struct extent_ptr_decoded *, int); /* KEY_TYPE_btree_ptr: */ @@ -753,4 +754,19 @@ static inline void bch2_key_resize(struct bkey *k, unsigned new_size) k->size = new_size; } +static inline u64 bch2_bkey_extent_ptrs_flags(struct bkey_ptrs_c ptrs) +{ + if (ptrs.start != ptrs.end && + extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) + return ptrs.start->flags.flags; + return 0; +} + +static inline u64 bch2_bkey_extent_flags(struct bkey_s_c k) +{ + return bch2_bkey_extent_ptrs_flags(bch2_bkey_ptrs_c(k)); +} + +int bch2_bkey_extent_flags_set(struct bch_fs *, struct bkey_i *, u64); + #endif /* _BCACHEFS_EXTENTS_H */ diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h index c198dfc376d6..74c0252cbd98 100644 --- a/fs/bcachefs/extents_format.h +++ 
b/fs/bcachefs/extents_format.h @@ -79,8 +79,9 @@ x(crc64, 2) \ x(crc128, 3) \ x(stripe_ptr, 4) \ - x(rebalance, 5) -#define BCH_EXTENT_ENTRY_MAX 6 + x(rebalance, 5) \ + x(flags, 6) +#define BCH_EXTENT_ENTRY_MAX 7 enum bch_extent_entry_type { #define x(f, n) BCH_EXTENT_ENTRY_##f = n, @@ -201,6 +202,25 @@ struct bch_extent_stripe_ptr { #endif }; +#define BCH_EXTENT_FLAGS() \ + x(poisoned, 0) + +enum bch_extent_flags_e { +#define x(n, v) BCH_EXTENT_FLAG_##n = v, + BCH_EXTENT_FLAGS() +#undef x +}; + +struct bch_extent_flags { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:7, + flags:57; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 flags:57, + type:7; +#endif +}; + /* bch_extent_rebalance: */ #include "rebalance_format.h"
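bch_extent_flags above is the first extent entry to use type value 6: entry type tags are stored one-hot in the low bits of each entry (note .type = BIT(BCH_EXTENT_ENTRY_flags) in extents.c earlier), which is why the struct spends 7 bits on type, one per possible entry type now that BCH_EXTENT_ENTRY_MAX is 7, and leaves 57 for flag bits such as poisoned. A standalone sketch of that tagging scheme (little-endian layout only, simplified from the real union of entry structs):

	#include <stdio.h>
	#include <stdint.h>

	enum entry_type { ENTRY_ptr = 0, ENTRY_crc32 = 1, /* ... */ ENTRY_flags = 6 };

	/* low 7 bits: one-hot type tag; high 57 bits: flag payload */
	static uint64_t flags_entry_pack(uint64_t flag_bits)
	{
		return (1ULL << ENTRY_flags) | (flag_bits << 7);
	}

	static enum entry_type entry_type_of(uint64_t word)
	{
		/* one-hot tag: the set bit's position is the type (assumes a valid tag) */
		return (enum entry_type) __builtin_ctzll(word & 0x7f);
	}

	int main(void)
	{
		uint64_t e = flags_entry_pack(1);	/* BCH_EXTENT_FLAG_poisoned = bit 0 */

		printf("type=%d poisoned=%llu\n",
		       entry_type_of(e),
		       (unsigned long long) ((e >> 7) & 1));
		return 0;
	}

diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h index 43d6c341ecca..e51529dca4c2 100644 --- a/fs/bcachefs/extents_types.h +++ b/fs/bcachefs/extents_types.h @@ -20,8 +20,9 @@ struct bch_extent_crc_unpacked { }; struct extent_ptr_decoded { - unsigned idx; bool has_ec; + bool do_ec_reconstruct; + u8 crc_retry_nr; struct bch_extent_crc_unpacked crc; struct bch_extent_ptr ptr; struct bch_extent_stripe_ptr ec; @@ -31,10 +32,10 @@ struct bch_io_failures { u8 nr; struct bch_dev_io_failures { u8 dev; - u8 idx; - u8 nr_failed; - u8 nr_retries; - } devs[BCH_REPLICAS_MAX]; + unsigned failed_csum_nr:6, + failed_io:1, + failed_ec:1; + } devs[BCH_REPLICAS_MAX + 1]; }; #endif /* _BCACHEFS_EXTENTS_TYPES_H */ diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c index 2eaffe37b5e7..0e742555cb0a 100644 --- a/fs/bcachefs/eytzinger.c +++ b/fs/bcachefs/eytzinger.c @@ -148,89 +148,99 @@ static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *pr return cmp(a, b, priv); } -static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size, +static inline int eytzinger1_do_cmp(void *base1, size_t n, size_t size, cmp_r_func_t cmp_func, const void *priv, size_t l, size_t r) { - return do_cmp(base + inorder_to_eytzinger0(l, n) * size, - base + inorder_to_eytzinger0(r, n) * size, + return do_cmp(base1 + inorder_to_eytzinger1(l, n) * size, + base1 + inorder_to_eytzinger1(r, n) * size, cmp_func, priv); } -static inline void eytzinger0_do_swap(void *base, size_t n, size_t size, +static inline void eytzinger1_do_swap(void *base1, size_t n, size_t size, swap_r_func_t swap_func, const void *priv, size_t l, size_t r) { - do_swap(base + inorder_to_eytzinger0(l, n) * size, - base + inorder_to_eytzinger0(r, n) * size, + do_swap(base1 + inorder_to_eytzinger1(l, n) * size, - base1 + inorder_to_eytzinger1(r, n) * size, size, swap_func, priv); } -void eytzinger0_sort_r(void *base, size_t n, size_t size, - cmp_r_func_t cmp_func, - swap_r_func_t swap_func, - const void *priv) +static void eytzinger1_sort_r(void *base1, size_t n, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv) { - int i, j, k; + unsigned i, j, k; /* called from 'sort' without swap function, let's pick the default */ if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func) swap_func = NULL; if (!swap_func) { - if (is_aligned(base, size, 8)) + if (is_aligned(base1, size, 8)) swap_func = SWAP_WORDS_64; - else if (is_aligned(base, size, 4)) + else if (is_aligned(base1, size, 4)) swap_func = SWAP_WORDS_32; else swap_func = SWAP_BYTES; } /* heapify */ - for (i = n / 2 - 1; i >= 0; --i) { + for (i = n / 2; i >= 1; --i) { /* Find the sift-down path all the way to the leaves. 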
*/ - for (j = i; k = j * 2 + 1, k + 1 < n;) - j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; + for (j = i; k = j * 2, k < n;) + j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; /* Special case for the last leaf with no sibling. */ - if (j * 2 + 2 == n) - j = j * 2 + 1; + if (j * 2 == n) + j *= 2; /* Backtrack to the correct location. */ - while (j != i && eytzinger0_do_cmp(base, n, size, cmp_func, priv, i, j) >= 0) - j = (j - 1) / 2; + while (j != i && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, i, j) >= 0) + j /= 2; /* Shift the element into its correct place. */ for (k = j; j != i;) { - j = (j - 1) / 2; - eytzinger0_do_swap(base, n, size, swap_func, priv, j, k); + j /= 2; + eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k); } } /* sort */ - for (i = n - 1; i > 0; --i) { - eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i); + for (i = n; i > 1; --i) { + eytzinger1_do_swap(base1, n, size, swap_func, priv, 1, i); /* Find the sift-down path all the way to the leaves. */ - for (j = 0; k = j * 2 + 1, k + 1 < i;) - j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; + for (j = 1; k = j * 2, k + 1 < i;) + j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; /* Special case for the last leaf with no sibling. */ - if (j * 2 + 2 == i) - j = j * 2 + 1; + if (j * 2 + 1 == i) + j *= 2; /* Backtrack to the correct location. */ - while (j && eytzinger0_do_cmp(base, n, size, cmp_func, priv, 0, j) >= 0) - j = (j - 1) / 2; + while (j >= 1 && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, 1, j) >= 0) + j /= 2; /* Shift the element into its correct place. */ - for (k = j; j;) { - j = (j - 1) / 2; - eytzinger0_do_swap(base, n, size, swap_func, priv, j, k); + for (k = j; j > 1;) { + j /= 2; + eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k); } } } +void eytzinger0_sort_r(void *base, size_t n, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv) +{ + void *base1 = base - size; + + return eytzinger1_sort_r(base1, n, size, cmp_func, swap_func, priv); +} + void eytzinger0_sort(void *base, size_t n, size_t size, cmp_func_t cmp_func, swap_func_t swap_func) diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index 0541192d7bc0..643c1f716061 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -6,6 +6,7 @@ #include <linux/log2.h> #ifdef EYTZINGER_DEBUG +#include <linux/bug.h> #define EYTZINGER_BUG_ON(cond) BUG_ON(cond) #else #define EYTZINGER_BUG_ON(cond) @@ -56,24 +57,14 @@ static inline unsigned eytzinger1_last(unsigned size) return rounddown_pow_of_two(size + 1) - 1; } -/* - * eytzinger1_next() and eytzinger1_prev() have the nice properties that - * - * eytzinger1_next(0) == eytzinger1_first()) - * eytzinger1_prev(0) == eytzinger1_last()) - * - * eytzinger1_prev(eytzinger1_first()) == 0 - * eytzinger1_next(eytzinger1_last()) == 0 - */ - static inline unsigned eytzinger1_next(unsigned i, unsigned size) { - EYTZINGER_BUG_ON(i > size); + EYTZINGER_BUG_ON(i == 0 || i > size); if (eytzinger1_right_child(i) <= size) { i = eytzinger1_right_child(i); - i <<= __fls(size + 1) - __fls(i); + i <<= __fls(size) - __fls(i); i >>= i > size; } else { i >>= ffz(i) + 1; @@ -84,12 +75,12 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size) static inline unsigned eytzinger1_prev(unsigned i, unsigned size) { - EYTZINGER_BUG_ON(i > size); + EYTZINGER_BUG_ON(i == 0 || i > size); if (eytzinger1_left_child(i) <= 
size) { i = eytzinger1_left_child(i) + 1; - i <<= __fls(size + 1) - __fls(i); + i <<= __fls(size) - __fls(i); i -= 1; i >>= i > size; } else { @@ -243,73 +234,63 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) (_i) != -1; \ (_i) = eytzinger0_next((_i), (_size))) +#define eytzinger0_for_each_prev(_i, _size) \ + for (unsigned (_i) = eytzinger0_last((_size)); \ + (_i) != -1; \ + (_i) = eytzinger0_prev((_i), (_size))) + /* return greatest node <= @search, or -1 if not found */ static inline int eytzinger0_find_le(void *base, size_t nr, size_t size, cmp_func_t cmp, const void *search) { - unsigned i, n = 0; - - if (!nr) - return -1; - - do { - i = n; - n = eytzinger0_child(i, cmp(base + i * size, search) <= 0); - } while (n < nr); - - if (n & 1) { - /* - * @i was greater than @search, return previous node: - * - * if @i was leftmost/smallest element, - * eytzinger0_prev(eytzinger0_first())) returns -1, as expected - */ - return eytzinger0_prev(i, nr); - } else { - return i; - } + void *base1 = base - size; + unsigned n = 1; + + while (n <= nr) + n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0); + n >>= __ffs(n) + 1; + return n - 1; } +/* return smallest node > @search, or -1 if not found */ static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size, cmp_func_t cmp, const void *search) { - ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); + void *base1 = base - size; + unsigned n = 1; - /* - * if eytitzinger0_find_le() returned -1 - no element was <= search - we - * want to return the first element; next/prev identities mean this work - * as expected - * - * similarly if find_le() returns last element, we should return -1; - * identities mean this all works out: - */ - return eytzinger0_next(idx, nr); + while (n <= nr) + n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0); + n >>= __ffs(n + 1) + 1; + return n - 1; } +/* return smallest node >= @search, or -1 if not found */ static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size, cmp_func_t cmp, const void *search) { - ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); - - if (idx < nr && !cmp(base + idx * size, search)) - return idx; + void *base1 = base - size; + unsigned n = 1; - return eytzinger0_next(idx, nr); + while (n <= nr) + n = eytzinger1_child(n, cmp(base1 + n * size, search) < 0); + n >>= __ffs(n + 1) + 1; + return n - 1; } #define eytzinger0_find(base, nr, size, _cmp, search) \ ({ \ - void *_base = (base); \ + size_t _size = (size); \ + void *_base1 = (void *)(base) - _size; \ const void *_search = (search); \ size_t _nr = (nr); \ - size_t _size = (size); \ - size_t _i = 0; \ + size_t _i = 1; \ int _res; \ \ - while (_i < _nr && \ - (_res = _cmp(_search, _base + _i * _size))) \ - _i = eytzinger0_child(_i, _res > 0); \ - _i; \ + while (_i <= _nr && \ + (_res = _cmp(_search, _base1 + _i * _size))) \ + _i = eytzinger1_child(_i, _res > 0); \ + _i - 1; \ }) void eytzinger0_sort_r(void *, size_t, size_t, 
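The searches above are rewritten onto 1-based indexing, where the arithmetic is simplest: node i has children 2i and 2i+1 and parent i/2, and a 0-based caller's array is adapted by pointing base1 one element before it so index 1 lands on element 0. After the descent, shifting the accumulated path index undoes the final run of same-direction turns. A compilable demo of the layout plus a find_ge-style lookup; the bit trick here is a common toy formulation of the same idea, not a copy of the kernel helpers:

	#include <stdio.h>

	/* fill et[1..n] (1-based eytzinger order) from sorted[0..n-1]:
	 * an in-order walk of the implicit tree visits keys in sorted order */
	static void fill(int *et, unsigned n, const int *sorted, unsigned *next, unsigned i)
	{
		if (i > n)
			return;
		fill(et, n, sorted, next, 2 * i);	/* left child */
		et[i] = sorted[(*next)++];
		fill(et, n, sorted, next, 2 * i + 1);	/* right child */
	}

	int main(void)
	{
		int sorted[7] = { 10, 20, 30, 40, 50, 60, 70 };
		int et[8];				/* slot 0 unused: 1-based */
		unsigned next = 0;

		fill(et, 7, sorted, &next, 1);
		for (unsigned i = 1; i <= 7; i++)	/* prints: 40 20 60 10 30 50 70 */
			printf("%d ", et[i]);
		printf("\n");

		/* find_ge-style descent: go right while the node is too small */
		int search = 55;
		unsigned k = 1;

		while (k <= 7)
			k = 2 * k + (et[k] < search);
		/* strip the trailing 1-bits of the path word to recover the last
		 * left turn; k == 0 means no element >= search */
		k >>= __builtin_ffs(~k);
		if (k)
			printf("find_ge(%d) -> %d\n", search, et[k]);	/* 60 */
		return 0;
	}

diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index ab1d5db2fa56..a03e2c780cba 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -110,11 +110,21 @@ static int readpage_bio_extend(struct btree_trans *trans, if (!get_more) break; + unsigned sectors_remaining = sectors_this_extent - bio_sectors(bio); + + if (sectors_remaining < PAGE_SECTORS << mapping_min_folio_order(iter->mapping)) + break; + + unsigned order = ilog2(rounddown_pow_of_two(sectors_remaining) / PAGE_SECTORS); + + /* 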
ensure proper alignment */ + order = min(order, __ffs(folio_offset|BIT(31))); + folio = xa_load(&iter->mapping->i_pages, folio_offset); if (folio && !xa_is_value(folio)) break; - folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0); + folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), order); if (!folio) break; @@ -149,12 +159,10 @@ static void bchfs_read(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_buf sk; - int flags = BCH_READ_RETRY_IF_STALE| - BCH_READ_MAY_PROMOTE; + int flags = BCH_READ_retry_if_stale| + BCH_READ_may_promote; int ret = 0; - rbio->c = c; - rbio->start_time = local_clock(); rbio->subvol = inum.subvol; bch2_bkey_buf_init(&sk); @@ -211,17 +219,17 @@ static void bchfs_read(struct btree_trans *trans, swap(rbio->bio.bi_iter.bi_size, bytes); if (rbio->bio.bi_iter.bi_size == bytes) - flags |= BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_last_fragment; bch2_bio_page_state_set(&rbio->bio, k); bch2_read_extent(trans, rbio, iter.pos, data_btree, k, offset_into_extent, flags); + swap(rbio->bio.bi_iter.bi_size, bytes); - if (flags & BCH_READ_LAST_FRAGMENT) + if (flags & BCH_READ_last_fragment) break; - swap(rbio->bio.bi_iter.bi_size, bytes); bio_advance(&rbio->bio, bytes); err: if (ret && @@ -232,7 +240,8 @@ err: if (ret) { struct printbuf buf = PRINTBUF; - bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9); + lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9)); prt_printf(&buf, "read error %i from btree lookup", ret); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); @@ -280,12 +289,13 @@ void bch2_readahead(struct readahead_control *ractl) struct bch_read_bio *rbio = rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, GFP_KERNEL, &c->bio_read), - opts); + c, + opts, + bch2_readpages_end_io); readpage_iter_advance(&readpages_iter); rbio->bio.bi_iter.bi_sector = folio_sector(folio); - rbio->bio.bi_end_io = bch2_readpages_end_io; BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); bchfs_read(trans, rbio, inode_inum(inode), @@ -323,10 +333,10 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) bch2_inode_opts_get(&opts, c, &inode->ei_inode); rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), - opts); + c, + opts, + bch2_read_single_folio_end_io); rbio->bio.bi_private = &done; - rbio->bio.bi_end_io = bch2_read_single_folio_end_io; - rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; rbio->bio.bi_iter.bi_sector = folio_sector(folio); BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); @@ -420,7 +430,7 @@ static void bch2_writepage_io_done(struct bch_write_op *op) } } - if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { + if (io->op.flags & BCH_WRITE_wrote_data_inline) { bio_for_each_folio_all(fi, bio) { struct bch_folio *s; diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 2089c36b5866..535bc5fcbcc0 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -73,6 +73,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) struct blk_plug plug; loff_t offset = req->ki_pos; bool sync = is_sync_kiocb(req); + bool split = false; size_t shorten; ssize_t ret; @@ -99,8 +100,6 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) GFP_KERNEL, &c->dio_read_bioset); - bio->bi_end_io = bch2_direct_IO_read_endio; - dio = container_of(bio, struct dio_read, rbio.bio); closure_init(&dio->cl, NULL); @@ 
-133,12 +132,13 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) goto start; while (iter->count) { + split = true; + bio = bio_alloc_bioset(NULL, bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), REQ_OP_READ, GFP_KERNEL, &c->bio_read); - bio->bi_end_io = bch2_direct_IO_read_split_endio; start: bio->bi_opf = REQ_OP_READ|REQ_SYNC; bio->bi_iter.bi_sector = offset >> 9; @@ -160,7 +160,15 @@ start: if (iter->count) closure_get(&dio->cl); - bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); + struct bch_read_bio *rbio = + rbio_init(bio, + c, + opts, + split + ? bch2_direct_IO_read_split_endio + : bch2_direct_IO_read_endio); + + bch2_read(c, rbio, inode_inum(inode)); } blk_finish_plug(&plug); @@ -511,8 +519,8 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio) dio->op.devs_need_flush = &inode->ei_devs_need_flush; if (sync) - dio->op.flags |= BCH_WRITE_SYNC; - dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; + dio->op.flags |= BCH_WRITE_sync; + dio->op.flags |= BCH_WRITE_check_enospc; ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, bio_sectors(bio), true); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 717e7b94c66f..c80ed3a54e70 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -999,17 +999,28 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) POS(inode->v.i_ino, offset >> 9), POS(inode->v.i_ino, U64_MAX), inum.subvol, BTREE_ITER_slots, k, ({ - if (k.k->p.inode != inode->v.i_ino) { - next_hole = bch2_seek_pagecache_hole(&inode->v, - offset, MAX_LFS_FILESIZE, 0, false); - break; - } else if (!bkey_extent_is_data(k.k)) { - next_hole = bch2_seek_pagecache_hole(&inode->v, - max(offset, bkey_start_offset(k.k) << 9), - k.k->p.offset << 9, 0, false); - - if (next_hole < k.k->p.offset << 9) + if (k.k->p.inode != inode->v.i_ino || + !bkey_extent_is_data(k.k)) { + loff_t start_offset = k.k->p.inode == inode->v.i_ino + ? max(offset, bkey_start_offset(k.k) << 9) + : offset; + loff_t end_offset = k.k->p.inode == inode->v.i_ino + ? MAX_LFS_FILESIZE + : k.k->p.offset << 9; + + /* + * Found a hole in the btree, now make sure it's + * a hole in the pagecache. We might have to + * keep searching if this hole is entirely dirty + * in the page cache: + */ + bch2_trans_unlock(trans); + loff_t pagecache_hole = bch2_seek_pagecache_hole(&inode->v, + start_offset, end_offset, 0, false); + if (pagecache_hole < end_offset) { + next_hole = pagecache_hole; break; + } } else { offset = max(offset, bkey_start_offset(k.k) << 9); } diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 595b57fabc9a..c1553e44e049 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -5,8 +5,8 @@ #include "chardev.h" #include "dirent.h" #include "fs.h" -#include "fs-common.h" #include "fs-ioctl.h" +#include "namei.h" #include "quota.h" #include <linux/compat.h> @@ -54,6 +54,32 @@ static int bch2_inode_flags_set(struct btree_trans *trans, (newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags) return -EINVAL; + if ((newflags ^ oldflags) & BCH_INODE_casefolded) { +#ifdef CONFIG_UNICODE + int ret = 0; + /* Not supported on individual files. */ + if (!S_ISDIR(bi->bi_mode)) + return -EOPNOTSUPP; + + /* + * Make sure the dir is empty, as otherwise we'd need to + * rehash everything and update the dirent keys. 
+ */ + ret = bch2_empty_dir_trans(trans, inode_inum(inode)); + if (ret < 0) + return ret; + + ret = bch2_request_incompat_feature(c,bcachefs_metadata_version_casefolding); + if (ret) + return ret; + + bch2_check_set_feature(c, BCH_FEATURE_casefolding); +#else + printk(KERN_ERR "Cannot use casefolding on a kernel without CONFIG_UNICODE\n"); + return -EOPNOTSUPP; +#endif + } + if (s->set_projinherit) { bi->bi_fields_set &= ~(1 << Inode_opt_project); bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project); @@ -218,7 +244,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, int ret = 0; subvol_inum inum; - kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); + kname = kmalloc(BCH_NAME_MAX, GFP_KERNEL); if (!kname) return -ENOMEM; @@ -511,10 +537,12 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, ret = -EXDEV; goto err; } - ret = __bch2_unlink(dir, victim, true); + + ret = inode_permission(file_mnt_idmap(filp), d_inode(victim), MAY_WRITE) ?: + __bch2_unlink(dir, victim, true); if (!ret) { fsnotify_rmdir(dir, victim); - d_delete(victim); + d_invalidate(victim); } err: inode_unlock(dir); diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h index d30f9bb056fd..ecd3bfdcde21 100644 --- a/fs/bcachefs/fs-ioctl.h +++ b/fs/bcachefs/fs-ioctl.h @@ -6,19 +6,21 @@ /* bcachefs inode flags -> vfs inode flags: */ static const __maybe_unused unsigned bch_flags_to_vfs[] = { - [__BCH_INODE_sync] = S_SYNC, - [__BCH_INODE_immutable] = S_IMMUTABLE, - [__BCH_INODE_append] = S_APPEND, - [__BCH_INODE_noatime] = S_NOATIME, + [__BCH_INODE_sync] = S_SYNC, + [__BCH_INODE_immutable] = S_IMMUTABLE, + [__BCH_INODE_append] = S_APPEND, + [__BCH_INODE_noatime] = S_NOATIME, + [__BCH_INODE_casefolded] = S_CASEFOLD, }; /* bcachefs inode flags -> FS_IOC_GETFLAGS: */ static const __maybe_unused unsigned bch_flags_to_uflags[] = { - [__BCH_INODE_sync] = FS_SYNC_FL, - [__BCH_INODE_immutable] = FS_IMMUTABLE_FL, - [__BCH_INODE_append] = FS_APPEND_FL, - [__BCH_INODE_nodump] = FS_NODUMP_FL, - [__BCH_INODE_noatime] = FS_NOATIME_FL, + [__BCH_INODE_sync] = FS_SYNC_FL, + [__BCH_INODE_immutable] = FS_IMMUTABLE_FL, + [__BCH_INODE_append] = FS_APPEND_FL, + [__BCH_INODE_nodump] = FS_NODUMP_FL, + [__BCH_INODE_noatime] = FS_NOATIME_FL, + [__BCH_INODE_casefolded] = FS_CASEFOLD_FL, }; /* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index b2669d7ffec5..fc834bdf1f52 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -11,7 +11,6 @@ #include "errcode.h" #include "extents.h" #include "fs.h" -#include "fs-common.h" #include "fs-io.h" #include "fs-ioctl.h" #include "fs-io-buffered.h" @@ -22,6 +21,7 @@ #include "io_read.h" #include "journal.h" #include "keylist.h" +#include "namei.h" #include "quota.h" #include "rebalance.h" #include "snapshot.h" @@ -641,7 +641,9 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, if (ret) return ERR_PTR(ret); - ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum); + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + + ret = bch2_dirent_read_target(trans, dir, d, &inum); if (ret > 0) ret = -ENOENT; if (ret) @@ -651,30 +653,30 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, if (inode) goto out; + /* + * Note: if check/repair needs it, we commit before + * bch2_inode_hash_init_insert(), as after that point we can't take a + * restart - not in the top level loop with a commit_do(), like we + * usually do: + */ + struct bch_subvolume subvol; struct 
bch_inode_unpacked inode_u; ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: + bch2_check_dirent_target(trans, &dirent_iter, d, &inode_u, false) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); + /* + * don't remove it: check_inodes might find another inode that points + * back to this dirent + */ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), - c, "dirent to missing inode:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + c, "dirent to missing inode:\n%s", + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)); if (ret) goto err; - - /* regular files may have hardlinks: */ - if (bch2_fs_inconsistent_on(bch2_inode_should_have_single_bp(&inode_u) && - !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)), - c, - "dirent points to inode that does not point back:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), - prt_printf(&buf, "\n "), - bch2_inode_unpacked_to_text(&buf, &inode_u), - buf.buf))) { - ret = -ENOENT; - goto err; - } out: bch2_trans_iter_exit(trans, &dirent_iter); printbuf_exit(&buf); @@ -698,6 +700,23 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, if (IS_ERR(inode)) inode = NULL; +#ifdef CONFIG_UNICODE + if (!inode && IS_CASEFOLDED(vdir)) { + /* + * Do not cache a negative dentry in casefolded directories + * as it would need to be invalidated in the following situation: + * - Lookup file "blAH" in a casefolded directory + * - Creation of file "BLAH" in a casefolded directory + * - Lookup file "blAH" in a casefolded directory + * which would fail if we had a negative dentry. + * + * We should come back to this when VFS has a method to handle + * this edgecase. 
+ */ + return NULL; + } +#endif + return d_splice_alias(&inode->v, dentry); } @@ -1802,7 +1821,8 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, break; } - mapping_set_large_folios(inode->v.i_mapping); + mapping_set_folio_min_order(inode->v.i_mapping, + get_order(trans->c->opts.block_size)); } static void bch2_free_inode(struct inode *vinode) @@ -2008,44 +2028,6 @@ static struct bch_fs *bch2_path_to_fs(const char *path) return c ?: ERR_PTR(-ENOENT); } -static int bch2_remount(struct super_block *sb, int *flags, - struct bch_opts opts) -{ - struct bch_fs *c = sb->s_fs_info; - int ret = 0; - - opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); - - if (opts.read_only != c->opts.read_only) { - down_write(&c->state_lock); - - if (opts.read_only) { - bch2_fs_read_only(c); - - sb->s_flags |= SB_RDONLY; - } else { - ret = bch2_fs_read_write(c); - if (ret) { - bch_err(c, "error going rw: %i", ret); - up_write(&c->state_lock); - ret = -EINVAL; - goto err; - } - - sb->s_flags &= ~SB_RDONLY; - } - - c->opts.read_only = opts.read_only; - - up_write(&c->state_lock); - } - - if (opt_defined(opts, errors)) - c->opts.errors = opts.errors; -err: - return bch2_err_class(ret); -} - static int bch2_show_devname(struct seq_file *seq, struct dentry *root) { struct bch_fs *c = root->d_sb->s_fs_info; @@ -2192,17 +2174,21 @@ static int bch2_fs_get_tree(struct fs_context *fc) if (ret) goto err; + if (opt_defined(opts, discard)) + set_bit(BCH_FS_discard_mount_opt_set, &c->flags); + /* Some options can't be parsed until after the fs is started: */ opts = bch2_opts_empty(); - ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf); + ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf, false); if (ret) goto err_stop_fs; bch2_opts_apply(&c->opts, opts); - ret = bch2_fs_start(c); - if (ret) - goto err_stop_fs; + /* + * need to initialise sb and set c->vfs_sb _before_ starting fs, + * for blk_holder_ops + */ sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c); ret = PTR_ERR_OR_ZERO(sb); @@ -2264,6 +2250,10 @@ got_sb: sb->s_shrink->seeks = 0; + ret = bch2_fs_start(c); + if (ret) + goto err_put_super; + vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); ret = PTR_ERR_OR_ZERO(vinode); bch_err_msg(c, ret, "mounting: error getting root inode"); @@ -2300,7 +2290,8 @@ err_stop_fs: goto err; err_put_super: - __bch2_fs_stop(c); + if (!sb->s_root) + __bch2_fs_stop(c); deactivate_locked_super(sb); goto err; } @@ -2343,6 +2334,8 @@ static int bch2_fs_parse_param(struct fs_context *fc, int ret = bch2_parse_one_mount_opt(c, &opts->opts, &opts->parse_later, param->key, param->string); + if (ret) + pr_err("Error parsing option %s: %s", param->key, bch2_err_str(ret)); return bch2_err_class(ret); } @@ -2351,8 +2344,39 @@ static int bch2_fs_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct bch2_opts_parse *opts = fc->fs_private; + struct bch_fs *c = sb->s_fs_info; + int ret = 0; + + opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0); + + if (opts->opts.read_only != c->opts.read_only) { + down_write(&c->state_lock); - return bch2_remount(sb, &fc->sb_flags, opts->opts); + if (opts->opts.read_only) { + bch2_fs_read_only(c); + + sb->s_flags |= SB_RDONLY; + } else { + ret = bch2_fs_read_write(c); + if (ret) { + bch_err(c, "error going rw: %i", ret); + up_write(&c->state_lock); + ret = -EINVAL; + goto err; + } + + sb->s_flags &= ~SB_RDONLY; + } + + c->opts.read_only = opts->opts.read_only; + + up_write(&c->state_lock); + 
} + + if (opt_defined(opts->opts, errors)) + c->opts.errors = opts->opts.errors; +err: + return bch2_err_class(ret); } static const struct fs_context_operations bch2_context_ops = { diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 0e85131d0af8..52320295dcf6 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -10,10 +10,10 @@ #include "dirent.h" #include "error.h" #include "fs.h" -#include "fs-common.h" #include "fsck.h" #include "inode.h" #include "keylist.h" +#include "namei.h" #include "recovery_passes.h" #include "snapshot.h" #include "super.h" @@ -23,13 +23,6 @@ #include <linux/bsearch.h> #include <linux/dcache.h> /* struct qstr */ -static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, - struct bkey_s_c_dirent d) -{ - return inode->bi_dir == d.k->p.inode && - inode->bi_dir_offset == d.k->p.offset; -} - static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d, struct bch_inode_unpacked *inode) { @@ -116,29 +109,6 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol, return ret; } -static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inode_nr) - break; - if (!bkey_is_inode(k.k)) - continue; - ret = bch2_inode_unpack(k, inode); - goto found; - } - ret = -BCH_ERR_ENOENT_inode; -found: - bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot, struct bch_inode_unpacked *inode) { @@ -179,32 +149,6 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans, return 0; } -static int __remove_dirent(struct btree_trans *trans, struct bpos pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bch_inode_unpacked dir_inode; - struct bch_hash_info dir_hash_info; - int ret; - - ret = lookup_first_inode(trans, pos.inode, &dir_inode); - if (ret) - goto err; - - dir_hash_info = bch2_hash_info_init(c, &dir_inode); - - bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); - - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - &dir_hash_info, &iter, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &iter); -err: - bch_err_fn(c, ret); - return ret; -} - /* * Find any subvolume associated with a tree of snapshots * We can't rely on master_subvol - it might have been deleted. 
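 */

/*
 * Note on the deletions above: __remove_dirent() and lookup_first_inode()
 * leave fsck.c, and the hunks below call bch2_fsck_remove_dirent() from
 * namei.c instead. A minimal sketch of that helper, assuming both moved
 * with their logic unchanged (the name comes from the call sites below;
 * the body is the code removed above):
 */
int bch2_fsck_remove_dirent(struct btree_trans *trans, struct bpos pos)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bch_inode_unpacked dir_inode;
	struct bch_hash_info dir_hash_info;

	/* any snapshot of the dir inode works for computing the hash info */
	int ret = lookup_first_inode(trans, pos.inode, &dir_inode);
	if (ret)
		goto err;

	dir_hash_info = bch2_hash_info_init(c, &dir_inode);

	bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);

	ret = bch2_btree_iter_traverse(&iter) ?:
	      bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
				  &dir_hash_info, &iter,
				  BTREE_UPDATE_internal_snapshot_node);
	bch2_trans_iter_exit(trans, &iter);
err:
	bch_err_fn(c, ret);
	return ret;
}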
@@ -548,7 +492,7 @@ static int remove_backpointer(struct btree_trans *trans, SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot)); int ret = bkey_err(d) ?: dirent_points_to_inode(c, d, inode) ?: - __remove_dirent(trans, d.k->p); + bch2_fsck_remove_dirent(trans, d.k->p); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1477,14 +1421,14 @@ static int check_key_has_inode(struct btree_trans *trans, if (fsck_err_on(!i, trans, key_in_missing_inode, - "key in missing inode:\n %s", + "key in missing inode:\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) goto delete; if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), trans, key_in_wrong_inode_type, - "key for wrong inode mode %o:\n %s", + "key for wrong inode mode %o:\n%s", i->inode.bi_mode, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) @@ -1627,13 +1571,13 @@ static int overlapping_extents_found(struct btree_trans *trans, if (ret) goto err; - prt_str(&buf, "\n "); + prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, k1); if (!bpos_eq(pos1, k1.k->p)) { - prt_str(&buf, "\n wanted\n "); + prt_str(&buf, "\nwanted\n "); bch2_bpos_to_text(&buf, pos1); - prt_str(&buf, "\n "); + prt_str(&buf, "\n"); bch2_bkey_to_text(&buf, &pos2); bch_err(c, "%s: error finding first overlapping extent when repairing, got%s", @@ -1656,7 +1600,7 @@ static int overlapping_extents_found(struct btree_trans *trans, break; } - prt_str(&buf, "\n "); + prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, k2); if (bpos_gt(k2.k->p, pos2.p) || @@ -1667,7 +1611,7 @@ static int overlapping_extents_found(struct btree_trans *trans, goto err; } - prt_printf(&buf, "\n overwriting %s extent", + prt_printf(&buf, "\noverwriting %s extent", pos1.snapshot >= pos2.p.snapshot ? 
"first" : "second"); if (fsck_err(trans, extent_overlapping, @@ -1688,6 +1632,8 @@ static int overlapping_extents_found(struct btree_trans *trans, bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc); bch2_disk_reservation_put(c, &res); + bch_info(c, "repair ret %s", bch2_err_str(ret)); + if (ret) goto err; @@ -1840,7 +1786,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && !bkey_extent_is_reservation(k), trans, extent_past_end_of_inode, - "extent type past end of inode %llu:%u, i_size %llu\n %s", + "extent type past end of inode %llu:%u, i_size %llu\n%s", i->inode.bi_inum, i->snapshot, i->inode.bi_size, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { struct btree_iter iter2; @@ -1985,169 +1931,6 @@ static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_wa trans_was_restarted(trans, restart_count); } -noinline_for_stack -static int check_dirent_inode_dirent(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - struct btree_iter bp_iter = { NULL }; - int ret = 0; - - if (inode_points_to_dirent(target, d)) - return 0; - - if (!target->bi_dir && - !target->bi_dir_offset) { - fsck_err_on(S_ISDIR(target->bi_mode), - trans, inode_dir_missing_backpointer, - "directory with missing backpointer\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_printf(&buf, "\n"), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf)); - - fsck_err_on(target->bi_flags & BCH_INODE_unlinked, - trans, inode_unlinked_but_has_dirent, - "inode unlinked but has dirent\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_printf(&buf, "\n"), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf)); - - target->bi_flags &= ~BCH_INODE_unlinked; - target->bi_dir = d.k->p.inode; - target->bi_dir_offset = d.k->p.offset; - return __bch2_fsck_write_inode(trans, target); - } - - if (bch2_inode_should_have_single_bp(target) && - !fsck_err(trans, inode_wrong_backpointer, - "dirent points to inode that does not point back:\n %s", - (bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_printf(&buf, "\n "), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf))) - goto err; - - struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, - SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot)); - ret = bkey_err(bp_dirent); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - bool backpointer_exists = !ret; - ret = 0; - - if (fsck_err_on(!backpointer_exists, - trans, inode_wrong_backpointer, - "inode %llu:%u has wrong backpointer:\n" - "got %llu:%llu\n" - "should be %llu:%llu", - target->bi_inum, target->bi_snapshot, - target->bi_dir, - target->bi_dir_offset, - d.k->p.inode, - d.k->p.offset)) { - target->bi_dir = d.k->p.inode; - target->bi_dir_offset = d.k->p.offset; - ret = __bch2_fsck_write_inode(trans, target); - goto out; - } - - bch2_bkey_val_to_text(&buf, c, d.s_c); - prt_newline(&buf); - if (backpointer_exists) - bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); - - if (fsck_err_on(backpointer_exists && - (S_ISDIR(target->bi_mode) || - target->bi_subvol), - trans, inode_dir_multiple_links, - "%s %llu:%u with multiple links\n%s", - S_ISDIR(target->bi_mode) ? 
"directory" : "subvolume", - target->bi_inum, target->bi_snapshot, buf.buf)) { - ret = __remove_dirent(trans, d.k->p); - goto out; - } - - /* - * hardlinked file with nlink 0: - * We're just adjusting nlink here so check_nlinks() will pick - * it up, it ignores inodes with nlink 0 - */ - if (fsck_err_on(backpointer_exists && !target->bi_nlink, - trans, inode_multiple_links_but_nlink_0, - "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", - target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { - target->bi_nlink++; - target->bi_flags &= ~BCH_INODE_unlinked; - ret = __bch2_fsck_write_inode(trans, target); - if (ret) - goto err; - } -out: -err: -fsck_err: - bch2_trans_iter_exit(trans, &bp_iter); - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -noinline_for_stack -static int check_dirent_target(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target) -{ - struct bch_fs *c = trans->c; - struct bkey_i_dirent *n; - struct printbuf buf = PRINTBUF; - int ret = 0; - - ret = check_dirent_inode_dirent(trans, iter, d, target); - if (ret) - goto err; - - if (fsck_err_on(d.v->d_type != inode_d_type(target), - trans, dirent_d_type_wrong, - "incorrect d_type: got %s, should be %s:\n%s", - bch2_d_type_str(d.v->d_type), - bch2_d_type_str(inode_d_type(target)), - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { - n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - bkey_reassemble(&n->k_i, d.s_c); - n->v.d_type = inode_d_type(target); - if (n->v.d_type == DT_SUBVOL) { - n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); - n->v.d_child_subvol = cpu_to_le32(target->bi_subvol); - } else { - n->v.d_inum = cpu_to_le64(target->bi_inum); - } - - ret = bch2_trans_update(trans, iter, &n->k_i, 0); - if (ret) - goto err; - - d = dirent_i_to_s_c(n); - } -err: -fsck_err: - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - /* find a subvolume that's a descendent of @snapshot: */ static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid) { @@ -2247,7 +2030,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * if (fsck_err(trans, dirent_to_missing_subvol, "dirent points to missing subvolume\n%s", (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) - return __remove_dirent(trans, d.k->p); + return bch2_fsck_remove_dirent(trans, d.k->p); ret = 0; goto out; } @@ -2291,7 +2074,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * goto err; } - ret = check_dirent_target(trans, iter, d, &subvol_root); + ret = bch2_check_dirent_target(trans, iter, d, &subvol_root, true); if (ret) goto err; out: @@ -2378,13 +2161,13 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = __remove_dirent(trans, d.k->p); + ret = bch2_fsck_remove_dirent(trans, d.k->p); if (ret) goto err; } darray_for_each(target->inodes, i) { - ret = check_dirent_target(trans, iter, d, &i->inode); + ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true); if (ret) goto err; } @@ -3240,7 +3023,7 @@ long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) if (arg.opts) { char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); ret = PTR_ERR_OR_ZERO(optstr) ?: - bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr); + 
bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr, false); if (!IS_ERR(optstr)) kfree(optstr); @@ -3348,7 +3131,7 @@ long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); ret = PTR_ERR_OR_ZERO(optstr) ?: - bch2_parse_mount_opts(c, &thr->opts, NULL, optstr); + bch2_parse_mount_opts(c, &thr->opts, NULL, optstr, false); if (!IS_ERR(optstr)) kfree(optstr); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 339b80770f1d..80051073f613 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -731,10 +731,9 @@ int bch2_trigger_inode(struct btree_trans *trans, bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); } - s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k); - if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr) { - struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_nr_inodes }; - int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1, flags & BTREE_TRIGGER_gc); + s64 nr[1] = { bkey_is_inode(new.k) - bkey_is_inode(old.k) }; + if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr[0]) { + int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, nr, nr_inodes); if (ret) return ret; } @@ -868,19 +867,6 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, uid, gid, mode, rdev, parent); } -static inline u32 bkey_generation(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_inode: - case KEY_TYPE_inode_v2: - BUG(); - case KEY_TYPE_inode_generation: - return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); - default: - return 0; - } -} - static struct bkey_i_inode_alloc_cursor * bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max) { @@ -1092,7 +1078,7 @@ retry: bch2_fs_inconsistent(c, "inode %llu:%u not found when deleting", inum.inum, snapshot); - ret = -EIO; + ret = -BCH_ERR_ENOENT_inode; goto err; } @@ -1256,7 +1242,7 @@ retry: bch2_fs_inconsistent(c, "inode %llu:%u not found when deleting", inum, snapshot); - ret = -EIO; + ret = -BCH_ERR_ENOENT_inode; goto err; } diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 428b9be6af34..f82cfbf460d0 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -277,6 +277,7 @@ static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *i bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset; return S_ISDIR(inode->bi_mode) || + inode->bi_subvol || (!inode->bi_nlink && inode_has_bp); } diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h index b99a5bf1a75e..117110af1e3f 100644 --- a/fs/bcachefs/inode_format.h +++ b/fs/bcachefs/inode_format.h @@ -137,7 +137,8 @@ enum inode_opt_id { x(i_sectors_dirty, 6) \ x(unlinked, 7) \ x(backptr_untrusted, 8) \ - x(has_child_snapshot, 9) + x(has_child_snapshot, 9) \ + x(casefolded, 10) /* bits 20+ reserved for packed fields below: */ diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index 5353979117b0..6b842c8d21be 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -115,7 +115,8 @@ err: bch2_increment_clock(c, sectors_allocated, WRITE); if (should_print_err(ret)) { struct printbuf buf = PRINTBUF; - bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9); + lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9)); prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); 
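/*
 * Same pattern as the fs-io-buffered.c hunk above:
 * bch2_inum_offset_err_msg_trans() does btree lookups, so it can fail
 * with a transaction restart; lockrestart_do() retries the expression
 * until it completes. Roughly (a sketch of the idiom, not the actual
 * macro definition):
 *
 *	do {
 *		bch2_trans_begin(trans);
 *		ret = (expr);
 *	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
 */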
printbuf_exit(&buf); diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index aa91fcf51eec..fd01e67b3e84 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -25,8 +25,15 @@ #include "subvolume.h" #include "trace.h" +#include <linux/random.h> #include <linux/sched/mm.h> +#ifdef CONFIG_BCACHEFS_DEBUG +static unsigned bch2_read_corrupt_ratio; +module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644); +MODULE_PARM_DESC(read_corrupt_ratio, ""); +#endif + #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT static bool bch2_target_congested(struct bch_fs *c, u16 target) @@ -80,6 +87,7 @@ struct promote_op { struct rhash_head hash; struct bpos pos; + struct work_struct work; struct data_update write; struct bio_vec bi_inline_vecs[]; /* must be last */ }; @@ -96,6 +104,33 @@ static inline bool have_io_error(struct bch_io_failures *failed) return failed && failed->nr; } +static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio) +{ + EBUG_ON(rbio->split); + + return rbio->data_update + ? container_of(rbio, struct data_update, rbio) + : NULL; +} + +static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev) +{ + struct data_update *u = rbio_data_update(orig); + if (!u) + return false; + + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k)); + unsigned i = 0; + bkey_for_each_ptr(ptrs, ptr) { + if (ptr->dev == dev && + u->data_opts.rewrite_ptrs & BIT(i)) + return true; + i++; + } + + return false; +} + static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, struct bpos pos, struct bch_io_opts opts, @@ -105,7 +140,7 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, if (!have_io_error(failed)) { BUG_ON(!opts.promote_target); - if (!(flags & BCH_READ_MAY_PROMOTE)) + if (!(flags & BCH_READ_may_promote)) return -BCH_ERR_nopromote_may_not; if (bch2_bkey_has_target(c, k, opts.promote_target)) @@ -125,98 +160,93 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, return 0; } -static void promote_free(struct bch_fs *c, struct promote_op *op) +static noinline void promote_free(struct bch_read_bio *rbio) { - int ret; + struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); + struct bch_fs *c = rbio->c; + + int ret = rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params); + BUG_ON(ret); bch2_data_update_exit(&op->write); - ret = rhashtable_remove_fast(&c->promote_table, &op->hash, - bch_promote_params); - BUG_ON(ret); bch2_write_ref_put(c, BCH_WRITE_REF_promote); kfree_rcu(op, rcu); } static void promote_done(struct bch_write_op *wop) { - struct promote_op *op = - container_of(wop, struct promote_op, write.op); - struct bch_fs *c = op->write.op.c; + struct promote_op *op = container_of(wop, struct promote_op, write.op); + struct bch_fs *c = op->write.rbio.c; - bch2_time_stats_update(&c->times[BCH_TIME_data_promote], - op->start_time); - promote_free(c, op); + bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time); + promote_free(&op->write.rbio); } -static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) +static void promote_start_work(struct work_struct *work) { - struct bio *bio = &op->write.op.wbio.bio; + struct promote_op *op = container_of(work, struct promote_op, work); - trace_and_count(op->write.op.c, read_promote, &rbio->bio); + bch2_data_update_read_done(&op->write); +} - /* we now own pages: */ - BUG_ON(!rbio->bounce); - BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); +static noinline void 
promote_start(struct bch_read_bio *rbio) +{ + struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); - memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, - sizeof(struct bio_vec) * rbio->bio.bi_vcnt); - swap(bio->bi_vcnt, rbio->bio.bi_vcnt); + trace_and_count(op->write.op.c, io_read_promote, &rbio->bio); - bch2_data_update_read_done(&op->write, rbio->pick.crc); + INIT_WORK(&op->work, promote_start_work); + queue_work(rbio->c->write_ref_wq, &op->work); } -static struct promote_op *__promote_alloc(struct btree_trans *trans, - enum btree_id btree_id, - struct bkey_s_c k, - struct bpos pos, - struct extent_ptr_decoded *pick, - struct bch_io_opts opts, - unsigned sectors, - struct bch_read_bio **rbio, - struct bch_io_failures *failed) +static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, + enum btree_id btree_id, + struct bkey_s_c k, + struct bpos pos, + struct extent_ptr_decoded *pick, + unsigned sectors, + struct bch_read_bio *orig, + struct bch_io_failures *failed) { struct bch_fs *c = trans->c; - struct promote_op *op = NULL; - struct bio *bio; - unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); int ret; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) - return ERR_PTR(-BCH_ERR_nopromote_no_writes); + struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait }; - op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL); - if (!op) { - ret = -BCH_ERR_nopromote_enomem; - goto err; - } + if (!have_io_error(failed)) { + update_opts.target = orig->opts.promote_target; + update_opts.extra_replicas = 1; + update_opts.write_flags |= BCH_WRITE_cached; + update_opts.write_flags |= BCH_WRITE_only_specified_devs; + } else { + update_opts.target = orig->opts.foreground_target; - op->start_time = local_clock(); - op->pos = pos; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + unsigned ptr_bit = 1; + bkey_for_each_ptr(ptrs, ptr) { + if (bch2_dev_io_failures(failed, ptr->dev) && + !ptr_being_rewritten(orig, ptr->dev)) + update_opts.rewrite_ptrs |= ptr_bit; + ptr_bit <<= 1; + } - /* - * We don't use the mempool here because extents that aren't - * checksummed or compressed can be too big for the mempool: - */ - *rbio = kzalloc(sizeof(struct bch_read_bio) + - sizeof(struct bio_vec) * pages, - GFP_KERNEL); - if (!*rbio) { - ret = -BCH_ERR_nopromote_enomem; - goto err; + if (!update_opts.rewrite_ptrs) + return NULL; } - rbio_init(&(*rbio)->bio, opts); - bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) + return ERR_PTR(-BCH_ERR_nopromote_no_writes); - if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) { + struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL); + if (!op) { ret = -BCH_ERR_nopromote_enomem; - goto err; + goto err_put; } - (*rbio)->bounce = true; - (*rbio)->split = true; - (*rbio)->kmalloc = true; + op->start_time = local_clock(); + op->pos = pos; if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, bch_promote_params)) { @@ -224,64 +254,44 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, goto err; } - bio = &op->write.op.wbio.bio; - bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); - - struct data_update_opts update_opts = {}; - - if (!have_io_error(failed)) { - update_opts.target = opts.promote_target; - update_opts.extra_replicas = 1; - update_opts.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED; - } else { - update_opts.target = opts.foreground_target; - - struct bkey_ptrs_c ptrs = 
bch2_bkey_ptrs_c(k); - unsigned ptr_bit = 1; - bkey_for_each_ptr(ptrs, ptr) { - if (bch2_dev_io_failures(failed, ptr->dev)) - update_opts.rewrite_ptrs |= ptr_bit; - ptr_bit <<= 1; - } - } - ret = bch2_data_update_init(trans, NULL, NULL, &op->write, writepoint_hashed((unsigned long) current), - opts, + &orig->opts, update_opts, btree_id, k); + op->write.type = BCH_DATA_UPDATE_promote; /* * possible errors: -BCH_ERR_nocow_lock_blocked, * -BCH_ERR_ENOSPC_disk_reservation: */ - if (ret) { - BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, - bch_promote_params)); - goto err; - } + if (ret) + goto err_remove_hash; + rbio_init_fragment(&op->write.rbio.bio, orig); + op->write.rbio.bounce = true; + op->write.rbio.promote = true; op->write.op.end_io = promote_done; - return op; + return &op->write.rbio; +err_remove_hash: + BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params)); err: - if (*rbio) - bio_free_pages(&(*rbio)->bio); - kfree(*rbio); - *rbio = NULL; + bio_free_pages(&op->write.op.wbio.bio); /* We may have added to the rhashtable and thus need rcu freeing: */ kfree_rcu(op, rcu); +err_put: bch2_write_ref_put(c, BCH_WRITE_REF_promote); return ERR_PTR(ret); } noinline -static struct promote_op *promote_alloc(struct btree_trans *trans, +static struct bch_read_bio *promote_alloc(struct btree_trans *trans, struct bvec_iter iter, struct bkey_s_c k, struct extent_ptr_decoded *pick, - struct bch_io_opts opts, unsigned flags, - struct bch_read_bio **rbio, + struct bch_read_bio *orig, bool *bounce, bool *read_full, struct bch_io_failures *failed) @@ -301,18 +311,21 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, struct bpos pos = promote_full ? bkey_start_pos(k.k) : POS(k.k->p.inode, iter.bi_sector); - struct promote_op *promote; int ret; - ret = should_promote(c, k, pos, opts, flags, failed); + ret = should_promote(c, k, pos, orig->opts, flags, failed); if (ret) goto nopromote; - promote = __promote_alloc(trans, - k.k->type == KEY_TYPE_reflink_v - ? BTREE_ID_reflink - : BTREE_ID_extents, - k, pos, pick, opts, sectors, rbio, failed); + struct bch_read_bio *promote = + __promote_alloc(trans, + k.k->type == KEY_TYPE_reflink_v + ? 
BTREE_ID_reflink + : BTREE_ID_extents, + k, pos, pick, sectors, orig, failed); + if (!promote) + return NULL; + ret = PTR_ERR_OR_ZERO(promote); if (ret) goto nopromote; @@ -321,7 +334,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, *read_full = promote_full; return promote; nopromote: - trace_read_nopromote(c, ret); + trace_io_read_nopromote(c, ret); return NULL; } @@ -330,9 +343,17 @@ nopromote: static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out, struct bch_read_bio *rbio, struct bpos read_pos) { - return bch2_inum_offset_err_msg_trans(trans, out, - (subvol_inum) { rbio->subvol, read_pos.inode }, - read_pos.offset << 9); + int ret = lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, out, + (subvol_inum) { rbio->subvol, read_pos.inode }, + read_pos.offset << 9)); + if (ret) + return ret; + + if (rbio->data_update) + prt_str(out, "(internal move) "); + + return 0; } static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, @@ -341,10 +362,6 @@ static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos)); } -#define READ_RETRY_AVOID 1 -#define READ_RETRY 2 -#define READ_ERR 3 - enum rbio_context { RBIO_CONTEXT_NULL, RBIO_CONTEXT_HIGHPRI, @@ -375,20 +392,25 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) { BUG_ON(rbio->bounce && !rbio->split); - if (rbio->promote) - promote_free(rbio->c, rbio->promote); - rbio->promote = NULL; - - if (rbio->bounce) - bch2_bio_free_pages_pool(rbio->c, &rbio->bio); + if (rbio->have_ioref) { + struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev); + percpu_ref_put(&ca->io_ref); + } if (rbio->split) { struct bch_read_bio *parent = rbio->parent; - if (rbio->kmalloc) - kfree(rbio); - else + if (unlikely(rbio->promote)) { + if (!rbio->bio.bi_status) + promote_start(rbio); + else + promote_free(rbio); + } else { + if (rbio->bounce) + bch2_bio_free_pages_pool(rbio->c, &rbio->bio); + bio_put(&rbio->bio); + } rbio = parent; } @@ -408,61 +430,49 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) bio_endio(&rbio->bio); } -static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, - struct bch_io_failures *failed, - unsigned flags) +static noinline int bch2_read_retry_nodecode(struct btree_trans *trans, + struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, + struct bch_io_failures *failed, + unsigned flags) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_buf sk; - struct bkey_s_c k; - int ret; - - flags &= ~BCH_READ_LAST_FRAGMENT; - flags |= BCH_READ_MUST_CLONE; - - bch2_bkey_buf_init(&sk); - - bch2_trans_iter_init(trans, &iter, rbio->data_btree, - rbio->read_pos, BTREE_ITER_slots); + struct data_update *u = container_of(rbio, struct data_update, rbio); retry: bch2_trans_begin(trans); - rbio->bio.bi_status = 0; - ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + struct btree_iter iter; + struct bkey_s_c k; + int ret = lockrestart_do(trans, + bkey_err(k = bch2_bkey_get_iter(trans, &iter, + u->btree_id, bkey_start_pos(&u->k.k->k), + 0))); if (ret) goto err; - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - - if (!bch2_bkey_matches_ptr(c, k, - rbio->pick.ptr, - rbio->data_pos.offset - - rbio->pick.crc.offset)) { + if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) { /* extent we wanted to read no longer exists: */ - rbio->hole 
= true; - goto out; + rbio->ret = -BCH_ERR_data_read_key_overwritten; + goto err; } ret = __bch2_read_extent(trans, rbio, bvec_iter, - rbio->read_pos, - rbio->data_btree, - k, 0, failed, flags); - if (ret == READ_RETRY) - goto retry; - if (ret) - goto err; -out: - bch2_rbio_done(rbio); - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - bch2_bkey_buf_exit(&sk, c); - return; + bkey_start_pos(&u->k.k->k), + u->btree_id, + bkey_i_to_s_c(u->k.k), + 0, failed, flags, -1); err: - rbio->bio.bi_status = BLK_STS_IOERR; - goto out; + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) + goto retry; + + if (ret) { + rbio->bio.bi_status = BLK_STS_IOERR; + rbio->ret = ret; + } + + BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1); + return ret; } static void bch2_rbio_retry(struct work_struct *work) @@ -477,45 +487,80 @@ static void bch2_rbio_retry(struct work_struct *work) .inum = rbio->read_pos.inode, }; struct bch_io_failures failed = { .nr = 0 }; + struct btree_trans *trans = bch2_trans_get(c); + + trace_io_read_retry(&rbio->bio); + this_cpu_add(c->counters[BCH_COUNTER_io_read_retry], + bvec_iter_sectors(rbio->bvec_iter)); - trace_and_count(c, read_retry, &rbio->bio); + if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) + bch2_mark_io_failure(&failed, &rbio->pick, + rbio->ret == -BCH_ERR_data_read_retry_csum_err); - if (rbio->retry == READ_RETRY_AVOID) - bch2_mark_io_failure(&failed, &rbio->pick); + if (!rbio->split) { + rbio->bio.bi_status = 0; + rbio->ret = 0; + } - rbio->bio.bi_status = 0; + unsigned subvol = rbio->subvol; + struct bpos read_pos = rbio->read_pos; rbio = bch2_rbio_free(rbio); - flags |= BCH_READ_IN_RETRY; - flags &= ~BCH_READ_MAY_PROMOTE; + flags |= BCH_READ_in_retry; + flags &= ~BCH_READ_may_promote; + flags &= ~BCH_READ_last_fragment; + flags |= BCH_READ_must_clone; + + int ret = rbio->data_update + ? 
bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags) + : __bch2_read(trans, rbio, iter, inum, &failed, flags); - if (flags & BCH_READ_NODECODE) { - bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); + if (ret) { + rbio->ret = ret; + rbio->bio.bi_status = BLK_STS_IOERR; } else { - flags &= ~BCH_READ_LAST_FRAGMENT; - flags |= BCH_READ_MUST_CLONE; + struct printbuf buf = PRINTBUF; - __bch2_read(c, rbio, iter, inum, &failed, flags); + lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, &buf, + (subvol_inum) { subvol, read_pos.inode }, + read_pos.offset << 9)); + if (rbio->data_update) + prt_str(&buf, "(internal move) "); + prt_str(&buf, "successful retry"); + + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); } + + bch2_rbio_done(rbio); + bch2_trans_put(trans); } -static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, - blk_status_t error) +static void bch2_rbio_error(struct bch_read_bio *rbio, + int ret, blk_status_t blk_error) { - rbio->retry = retry; + BUG_ON(ret >= 0); + + rbio->ret = ret; + rbio->bio.bi_status = blk_error; - if (rbio->flags & BCH_READ_IN_RETRY) + bch2_rbio_parent(rbio)->saw_error = true; + + if (rbio->flags & BCH_READ_in_retry) return; - if (retry == READ_ERR) { + if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) { + bch2_rbio_punt(rbio, bch2_rbio_retry, + RBIO_CONTEXT_UNBOUND, system_unbound_wq); + } else { rbio = bch2_rbio_free(rbio); - rbio->bio.bi_status = error; + rbio->ret = ret; + rbio->bio.bi_status = blk_error; + bch2_rbio_done(rbio); - } else { - bch2_rbio_punt(rbio, bch2_rbio_retry, - RBIO_CONTEXT_UNBOUND, system_unbound_wq); } } @@ -531,15 +576,13 @@ static void bch2_read_io_err(struct work_struct *work) bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status)); - if (ca) { - bch2_io_error(ca, BCH_MEMBER_ERROR_read); + if (ca) bch_err_ratelimited(ca, "%s", buf.buf); - } else { + else bch_err_ratelimited(c, "%s", buf.buf); - } printbuf_exit(&buf); - bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status); } static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, @@ -621,14 +664,12 @@ static void bch2_read_csum_err(struct work_struct *work) bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); struct bch_dev *ca = rbio->have_ioref ? 
bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - if (ca) { - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + if (ca) bch_err_ratelimited(ca, "%s", buf.buf); - } else { + else bch_err_ratelimited(c, "%s", buf.buf); - } - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR); printbuf_exit(&buf); } @@ -648,7 +689,7 @@ static void bch2_read_decompress_err(struct work_struct *work) else bch_err_ratelimited(c, "%s", buf.buf); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR); printbuf_exit(&buf); } @@ -668,7 +709,7 @@ static void bch2_read_decrypt_err(struct work_struct *work) else bch_err_ratelimited(c, "%s", buf.buf); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR); printbuf_exit(&buf); } @@ -678,9 +719,11 @@ static void __bch2_read_endio(struct work_struct *work) struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); struct bch_fs *c = rbio->c; - struct bio *src = &rbio->bio; - struct bio *dst = &bch2_rbio_parent(rbio)->bio; - struct bvec_iter dst_iter = rbio->bvec_iter; + struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; + struct bch_read_bio *parent = bch2_rbio_parent(rbio); + struct bio *src = &rbio->bio; + struct bio *dst = &parent->bio; + struct bvec_iter dst_iter = rbio->bvec_iter; struct bch_extent_crc_unpacked crc = rbio->pick.crc; struct nonce nonce = extent_nonce(rbio->version, crc); unsigned nofs_flags; @@ -698,8 +741,26 @@ static void __bch2_read_endio(struct work_struct *work) src->bi_iter = rbio->bvec_iter; } + bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio); + csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); - if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io) + bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io; + + /* + * Checksum error: if the bio wasn't bounced, we may have been + * reading into buffers owned by userspace (that userspace can + * scribble over) - retry the read, bouncing it this time: + */ + if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) { + rbio->flags |= BCH_READ_must_bounce; + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace, + BLK_STS_IOERR); + goto out; + } + + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good); + + if (!csum_good) goto csum_err; /* @@ -712,32 +773,40 @@ static void __bch2_read_endio(struct work_struct *work) if (unlikely(rbio->narrow_crcs)) bch2_rbio_narrow_crcs(rbio); - if (rbio->flags & BCH_READ_NODECODE) - goto nodecode; + if (likely(!parent->data_update)) { + /* Adjust crc to point to subset of data we want: */ + crc.offset += rbio->offset_into_extent; + crc.live_size = bvec_iter_sectors(rbio->bvec_iter); - /* Adjust crc to point to subset of data we want: */ - crc.offset += rbio->offset_into_extent; - crc.live_size = bvec_iter_sectors(rbio->bvec_iter); + if (crc_is_compressed(crc)) { + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; - if (crc_is_compressed(crc)) { - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; + if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && + !c->opts.no_data_io) + goto decompression_err; + } else { + /* don't need to decrypt the entire bio: */ + nonce = nonce_add(nonce, crc.offset << 9); + bio_advance(src, crc.offset << 9); 
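/*
 * (crc.offset counts 512-byte sectors, hence the << 9 above: the nonce
 * and the bio cursor must advance past the same number of bytes, so
 * decryption starts exactly at the subset of the extent this read
 * wants.)
 */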
- if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && - !c->opts.no_data_io) - goto decompression_err; - } else { - /* don't need to decrypt the entire bio: */ - nonce = nonce_add(nonce, crc.offset << 9); - bio_advance(src, crc.offset << 9); + BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); + src->bi_iter.bi_size = dst_iter.bi_size; - BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); - src->bi_iter.bi_size = dst_iter.bi_size; + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; + if (rbio->bounce) { + struct bvec_iter src_iter = src->bi_iter; + + bio_copy_data_iter(dst, &dst_iter, src, &src_iter); + } + } + } else { + if (rbio->split) + rbio->parent->pick = rbio->pick; if (rbio->bounce) { struct bvec_iter src_iter = src->bi_iter; @@ -754,12 +823,9 @@ static void __bch2_read_endio(struct work_struct *work) ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); if (ret) goto decrypt_err; - - promote_start(rbio->promote, rbio); - rbio->promote = NULL; } -nodecode: - if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { + + if (likely(!(rbio->flags & BCH_READ_in_retry))) { rbio = bch2_rbio_free(rbio); bch2_rbio_done(rbio); } @@ -767,17 +833,6 @@ out: memalloc_nofs_restore(nofs_flags); return; csum_err: - /* - * Checksum error: if the bio wasn't bounced, we may have been - * reading into buffers owned by userspace (that userspace can - * scribble over) - retry the read, bouncing it this time: - */ - if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { - rbio->flags |= BCH_READ_MUST_BOUNCE; - bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); - goto out; - } - bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); goto out; decompression_err: @@ -797,10 +852,8 @@ static void bch2_read_endio(struct bio *bio) struct workqueue_struct *wq = NULL; enum rbio_context context = RBIO_CONTEXT_NULL; - if (rbio->have_ioref) { - bch2_latency_acct(ca, rbio->submit_time, READ); - percpu_ref_put(&ca->io_ref); - } + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, + rbio->submit_time, !bio->bi_status); if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; @@ -810,14 +863,14 @@ static void bch2_read_endio(struct bio *bio) return; } - if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || + if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) || (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) { - trace_and_count(c, read_reuse_race, &rbio->bio); + trace_and_count(c, io_read_reuse_race, &rbio->bio); - if (rbio->flags & BCH_READ_RETRY_IF_STALE) - bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); + if (rbio->flags & BCH_READ_retry_if_stale) + bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN); else - bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); + bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN); return; } @@ -883,15 +936,15 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct bvec_iter iter, struct bpos read_pos, enum btree_id data_btree, struct bkey_s_c k, unsigned offset_into_extent, - struct bch_io_failures *failed, unsigned flags) + struct bch_io_failures *failed, unsigned flags, int dev) { struct bch_fs *c = trans->c; struct extent_ptr_decoded pick; struct bch_read_bio *rbio = NULL; - struct promote_op *promote = NULL; bool bounce = false, read_full = false, narrow_crcs = false; struct bpos data_pos = bkey_start_pos(k.k); - int pick_ret; + struct data_update *u 
= rbio_data_update(orig); + int ret = 0; if (bkey_extent_is_inline_data(k.k)) { unsigned bytes = min_t(unsigned, iter.bi_size, @@ -902,19 +955,21 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, swap(iter.bi_size, bytes); bio_advance_iter(&orig->bio, &iter, bytes); zero_fill_bio_iter(&orig->bio, iter); + this_cpu_add(c->counters[BCH_COUNTER_io_read_inline], + bvec_iter_sectors(iter)); goto out_read_done; } retry_pick: - pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); + ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev); /* hole or reservation - just zero fill: */ - if (!pick_ret) + if (!ret) goto hole; - if (unlikely(pick_ret < 0)) { + if (unlikely(ret < 0)) { struct printbuf buf = PRINTBUF; bch2_read_err_msg_trans(trans, &buf, orig, read_pos); - prt_printf(&buf, "no device to read from: %s\n ", bch2_err_str(pick_ret)); + prt_printf(&buf, "%s\n ", bch2_err_str(ret)); bch2_bkey_val_to_text(&buf, c, k); bch_err_ratelimited(c, "%s", buf.buf); @@ -930,6 +985,7 @@ retry_pick: bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); + ret = -BCH_ERR_data_read_no_encryption_key; goto err; } @@ -941,56 +997,57 @@ retry_pick: * retry path, don't check here, it'll be caught in bch2_read_endio() * and we'll end up in the retry path: */ - if ((flags & BCH_READ_IN_RETRY) && + if ((flags & BCH_READ_in_retry) && !pick.ptr.cached && ca && unlikely(dev_ptr_stale(ca, &pick.ptr))) { read_from_stale_dirty_pointer(trans, ca, k, pick.ptr); - bch2_mark_io_failure(failed, &pick); + bch2_mark_io_failure(failed, &pick, false); percpu_ref_put(&ca->io_ref); goto retry_pick; } - if (flags & BCH_READ_NODECODE) { + if (likely(!u)) { + if (!(flags & BCH_READ_last_fragment) || + bio_flagged(&orig->bio, BIO_CHAIN)) + flags |= BCH_READ_must_clone; + + narrow_crcs = !(flags & BCH_READ_in_retry) && + bch2_can_narrow_extent_crcs(k, pick.crc); + + if (narrow_crcs && (flags & BCH_READ_user_mapped)) + flags |= BCH_READ_must_bounce; + + EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); + + if (crc_is_compressed(pick.crc) || + (pick.crc.csum_type != BCH_CSUM_none && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + (bch2_csum_type_is_encryption(pick.crc.csum_type) && + (flags & BCH_READ_user_mapped)) || + (flags & BCH_READ_must_bounce)))) { + read_full = true; + bounce = true; + } + } else { /* * can happen if we retry, and the extent we were going to read * has been merged in the meantime: */ - if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) { + if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { if (ca) percpu_ref_put(&ca->io_ref); - goto hole; + rbio->ret = -BCH_ERR_data_read_buffer_too_small; + goto out_read_done; } iter.bi_size = pick.crc.compressed_size << 9; - goto get_bio; - } - - if (!(flags & BCH_READ_LAST_FRAGMENT) || - bio_flagged(&orig->bio, BIO_CHAIN)) - flags |= BCH_READ_MUST_CLONE; - - narrow_crcs = !(flags & BCH_READ_IN_RETRY) && - bch2_can_narrow_extent_crcs(k, pick.crc); - - if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) - flags |= BCH_READ_MUST_BOUNCE; - - EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); - - if (crc_is_compressed(pick.crc) || - (pick.crc.csum_type != BCH_CSUM_none && - (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || - (bch2_csum_type_is_encryption(pick.crc.csum_type) && - (flags & BCH_READ_USER_MAPPED)) || - (flags & BCH_READ_MUST_BOUNCE)))) { read_full = true; - bounce = true; } if (orig->opts.promote_target || have_io_error(failed)) - promote = 
promote_alloc(trans, iter, k, &pick, orig->opts, flags, - &rbio, &bounce, &read_full, failed); + rbio = promote_alloc(trans, iter, k, &pick, flags, orig, + &bounce, &read_full, failed); if (!read_full) { EBUG_ON(crc_is_compressed(pick.crc)); @@ -1009,7 +1066,7 @@ retry_pick: pick.crc.offset = 0; pick.crc.live_size = bvec_iter_sectors(iter); } -get_bio: + if (rbio) { /* * promote already allocated bounce rbio: @@ -1024,17 +1081,16 @@ get_bio: } else if (bounce) { unsigned sectors = pick.crc.compressed_size; - rbio = rbio_init(bio_alloc_bioset(NULL, + rbio = rbio_init_fragment(bio_alloc_bioset(NULL, DIV_ROUND_UP(sectors, PAGE_SECTORS), 0, GFP_NOFS, &c->bio_read_split), - orig->opts); + orig); bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); rbio->bounce = true; - rbio->split = true; - } else if (flags & BCH_READ_MUST_CLONE) { + } else if (flags & BCH_READ_must_clone) { /* * Have to clone if there were any splits, due to error * reporting issues (if a split errored, and retrying didn't @@ -1043,11 +1099,10 @@ get_bio: * from the whole bio, in which case we don't want to retry and * lose the error) */ - rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, + rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, &c->bio_read_split), - orig->opts); + orig); rbio->bio.bi_iter = iter; - rbio->split = true; } else { rbio = orig; rbio->bio.bi_iter = iter; @@ -1056,67 +1111,60 @@ get_bio: EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); - rbio->c = c; rbio->submit_time = local_clock(); - if (rbio->split) - rbio->parent = orig; - else + if (!rbio->split) rbio->end_io = orig->bio.bi_end_io; rbio->bvec_iter = iter; rbio->offset_into_extent= offset_into_extent; rbio->flags = flags; rbio->have_ioref = ca != NULL; rbio->narrow_crcs = narrow_crcs; - rbio->hole = 0; - rbio->retry = 0; + rbio->ret = 0; rbio->context = 0; - /* XXX: only initialize this if needed */ - rbio->devs_have = bch2_bkey_devs(k); rbio->pick = pick; rbio->subvol = orig->subvol; rbio->read_pos = read_pos; rbio->data_btree = data_btree; rbio->data_pos = data_pos; rbio->version = k.k->bversion; - rbio->promote = promote; INIT_WORK(&rbio->work, NULL); - if (flags & BCH_READ_NODECODE) - orig->pick = pick; - rbio->bio.bi_opf = orig->bio.bi_opf; rbio->bio.bi_iter.bi_sector = pick.ptr.offset; rbio->bio.bi_end_io = bch2_read_endio; if (rbio->bounce) - trace_and_count(c, read_bounce, &rbio->bio); + trace_and_count(c, io_read_bounce, &rbio->bio); - this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); + if (!u) + this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); + else + this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio)); bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); /* * If it's being moved internally, we don't want to flag it as a cache * hit: */ - if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE)) + if (ca && pick.ptr.cached && !u) bch2_bucket_io_time_reset(trans, pick.ptr.dev, PTR_BUCKET_NR(ca, &pick.ptr), READ); - if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { + if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) { bio_inc_remaining(&orig->bio); - trace_and_count(c, read_split, &orig->bio); + trace_and_count(c, io_read_split, &orig->bio); } /* * Unlock the iterator while the btree node's lock is still in * cache, before doing the IO: */ - if (!(flags & BCH_READ_IN_RETRY)) + if (!(flags & BCH_READ_in_retry)) bch2_trans_unlock(trans); else bch2_trans_unlock_long(trans); - if (!rbio->pick.idx) { + 
if (likely(!rbio->pick.do_ec_reconstruct)) { if (unlikely(!rbio->have_ioref)) { struct printbuf buf = PRINTBUF; bch2_read_err_msg_trans(trans, &buf, rbio, read_pos); @@ -1126,7 +1174,9 @@ get_bio: bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + bch2_rbio_error(rbio, + -BCH_ERR_data_read_retry_device_offline, + BLK_STS_IOERR); goto out; } @@ -1135,10 +1185,10 @@ get_bio: bio_set_dev(&rbio->bio, ca->disk_sb.bdev); if (unlikely(c->opts.no_data_io)) { - if (likely(!(flags & BCH_READ_IN_RETRY))) + if (likely(!(flags & BCH_READ_in_retry))) bio_endio(&rbio->bio); } else { - if (likely(!(flags & BCH_READ_IN_RETRY))) + if (likely(!(flags & BCH_READ_in_retry))) submit_bio(&rbio->bio); else submit_bio_wait(&rbio->bio); @@ -1152,15 +1202,16 @@ get_bio: } else { /* Attempting reconstruct read: */ if (bch2_ec_read_extent(trans, rbio, k)) { - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err, + BLK_STS_IOERR); goto out; } - if (likely(!(flags & BCH_READ_IN_RETRY))) + if (likely(!(flags & BCH_READ_in_retry))) bio_endio(&rbio->bio); } out: - if (likely(!(flags & BCH_READ_IN_RETRY))) { + if (likely(!(flags & BCH_READ_in_retry))) { return 0; } else { bch2_trans_unlock(trans); @@ -1170,54 +1221,54 @@ out: rbio->context = RBIO_CONTEXT_UNBOUND; bch2_read_endio(&rbio->bio); - ret = rbio->retry; + ret = rbio->ret; rbio = bch2_rbio_free(rbio); - if (ret == READ_RETRY_AVOID) { - bch2_mark_io_failure(failed, &pick); - ret = READ_RETRY; - } - - if (!ret) - goto out_read_done; + if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid)) + bch2_mark_io_failure(failed, &pick, + ret == -BCH_ERR_data_read_retry_csum_err); return ret; } err: - if (flags & BCH_READ_IN_RETRY) - return READ_ERR; + if (flags & BCH_READ_in_retry) + return ret; - orig->bio.bi_status = BLK_STS_IOERR; + orig->bio.bi_status = BLK_STS_IOERR; + orig->ret = ret; goto out_read_done; hole: + this_cpu_add(c->counters[BCH_COUNTER_io_read_hole], + bvec_iter_sectors(iter)); /* - * won't normally happen in the BCH_READ_NODECODE - * (bch2_move_extent()) path, but if we retry and the extent we wanted - * to read no longer exists we have to signal that: + * won't normally happen in the data update (bch2_move_extent()) path, + * but if we retry and the extent we wanted to read no longer exists we + * have to signal that: */ - if (flags & BCH_READ_NODECODE) - orig->hole = true; + if (u) + orig->ret = -BCH_ERR_data_read_key_overwritten; zero_fill_bio_iter(&orig->bio, iter); out_read_done: - if (flags & BCH_READ_LAST_FRAGMENT) + if ((flags & BCH_READ_last_fragment) && + !(flags & BCH_READ_in_retry)) bch2_rbio_done(orig); return 0; } -void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, subvol_inum inum, - struct bch_io_failures *failed, unsigned flags) +int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, subvol_inum inum, + struct bch_io_failures *failed, unsigned flags) { - struct btree_trans *trans = bch2_trans_get(c); + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_buf sk; struct bkey_s_c k; int ret; - BUG_ON(flags & BCH_READ_NODECODE); + EBUG_ON(rbio->data_update); bch2_bkey_buf_init(&sk); bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, @@ -1267,24 +1318,27 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, swap(bvec_iter.bi_size, bytes); if (bvec_iter.bi_size == bytes) - flags |= 
BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_last_fragment; ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos, data_btree, k, - offset_into_extent, failed, flags); + offset_into_extent, failed, flags, -1); + swap(bvec_iter.bi_size, bytes); + if (ret) goto err; - if (flags & BCH_READ_LAST_FRAGMENT) + if (flags & BCH_READ_last_fragment) break; - swap(bvec_iter.bi_size, bytes); bio_advance_iter(&rbio->bio, &bvec_iter, bytes); err: + if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace) + flags |= BCH_READ_must_bounce; + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && - ret != READ_RETRY && - ret != READ_RETRY_AVOID) + !bch2_err_matches(ret, BCH_ERR_data_read_retry)) break; } @@ -1292,17 +1346,22 @@ err: if (ret) { struct printbuf buf = PRINTBUF; - bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9); - prt_printf(&buf, "read error %i from btree lookup", ret); + lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, &buf, inum, + bvec_iter.bi_sector << 9)); + prt_printf(&buf, "read error: %s", bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); - rbio->bio.bi_status = BLK_STS_IOERR; - bch2_rbio_done(rbio); + rbio->bio.bi_status = BLK_STS_IOERR; + rbio->ret = ret; + + if (!(flags & BCH_READ_in_retry)) + bch2_rbio_done(rbio); } - bch2_trans_put(trans); bch2_bkey_buf_exit(&sk, c); + return ret; } void bch2_fs_io_read_exit(struct bch_fs *c) diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index a82e8a94ccb6..c78025d863e0 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -3,6 +3,7 @@ #define _BCACHEFS_IO_READ_H #include "bkey_buf.h" +#include "btree_iter.h" #include "reflink.h" struct bch_read_bio { @@ -35,19 +36,18 @@ struct bch_read_bio { u16 flags; union { struct { - u16 bounce:1, + u16 data_update:1, + promote:1, + bounce:1, split:1, - kmalloc:1, have_ioref:1, narrow_crcs:1, - hole:1, - retry:2, + saw_error:1, context:2; }; u16 _state; }; - - struct bch_devs_list devs_have; + s16 ret; struct extent_ptr_decoded pick; @@ -65,8 +65,6 @@ struct bch_read_bio { struct bpos data_pos; struct bversion version; - struct promote_op *promote; - struct bch_io_opts opts; struct work_struct work; @@ -108,61 +106,89 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans, return 0; } +#define BCH_READ_FLAGS() \ + x(retry_if_stale) \ + x(may_promote) \ + x(user_mapped) \ + x(last_fragment) \ + x(must_bounce) \ + x(must_clone) \ + x(in_retry) + +enum __bch_read_flags { +#define x(n) __BCH_READ_##n, + BCH_READ_FLAGS() +#undef x +}; + enum bch_read_flags { - BCH_READ_RETRY_IF_STALE = 1 << 0, - BCH_READ_MAY_PROMOTE = 1 << 1, - BCH_READ_USER_MAPPED = 1 << 2, - BCH_READ_NODECODE = 1 << 3, - BCH_READ_LAST_FRAGMENT = 1 << 4, - - /* internal: */ - BCH_READ_MUST_BOUNCE = 1 << 5, - BCH_READ_MUST_CLONE = 1 << 6, - BCH_READ_IN_RETRY = 1 << 7, +#define x(n) BCH_READ_##n = BIT(__BCH_READ_##n), + BCH_READ_FLAGS() +#undef x }; int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, struct bvec_iter, struct bpos, enum btree_id, struct bkey_s_c, unsigned, - struct bch_io_failures *, unsigned); + struct bch_io_failures *, unsigned, int); static inline void bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, struct bpos read_pos, enum btree_id data_btree, struct bkey_s_c k, unsigned offset_into_extent, unsigned flags) { - __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, - data_btree, k, offset_into_extent, NULL, flags); + int ret = __bch2_read_extent(trans, 
rbio, rbio->bio.bi_iter, read_pos, + data_btree, k, offset_into_extent, NULL, flags, -1); + /* __bch2_read_extent only returns errors if BCH_READ_in_retry is set */ + WARN(ret, "unhandled error from __bch2_read_extent()"); } -void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, - subvol_inum, struct bch_io_failures *, unsigned flags); +int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter, + subvol_inum, struct bch_io_failures *, unsigned flags); static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, subvol_inum inum) { - struct bch_io_failures failed = { .nr = 0 }; - BUG_ON(rbio->_state); - rbio->c = c; - rbio->start_time = local_clock(); rbio->subvol = inum.subvol; - __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, - BCH_READ_RETRY_IF_STALE| - BCH_READ_MAY_PROMOTE| - BCH_READ_USER_MAPPED); + bch2_trans_run(c, + __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, + BCH_READ_retry_if_stale| + BCH_READ_may_promote| + BCH_READ_user_mapped)); +} + +static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, + struct bch_read_bio *orig) +{ + struct bch_read_bio *rbio = to_rbio(bio); + + rbio->c = orig->c; + rbio->_state = 0; + rbio->flags = 0; + rbio->ret = 0; + rbio->split = true; + rbio->parent = orig; + rbio->opts = orig->opts; + return rbio; } static inline struct bch_read_bio *rbio_init(struct bio *bio, - struct bch_io_opts opts) + struct bch_fs *c, + struct bch_io_opts opts, + bio_end_io_t end_io) { struct bch_read_bio *rbio = to_rbio(bio); - rbio->_state = 0; - rbio->promote = NULL; - rbio->opts = opts; + rbio->start_time = local_clock(); + rbio->c = c; + rbio->_state = 0; + rbio->flags = 0; + rbio->ret = 0; + rbio->opts = opts; + rbio->bio.bi_end_io = end_io; return rbio; } diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 03892388832b..07b55839768e 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -34,6 +34,12 @@ #include <linux/random.h> #include <linux/sched/mm.h> +#ifdef CONFIG_BCACHEFS_DEBUG +static unsigned bch2_write_corrupt_ratio; +module_param_named(write_corrupt_ratio, bch2_write_corrupt_ratio, uint, 0644); +MODULE_PARM_DESC(write_corrupt_ratio, ""); +#endif + #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, @@ -374,7 +380,7 @@ static int bch2_write_index_default(struct bch_write_op *op) bch2_extent_update(trans, inum, &iter, sk.k, &op->res, op->new_i_size, &op->i_sectors_delta, - op->flags & BCH_WRITE_CHECK_ENOSPC); + op->flags & BCH_WRITE_check_enospc); bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -396,29 +402,36 @@ static int bch2_write_index_default(struct bch_write_op *op) /* Writes */ -static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, - u64 offset) +void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, ...) { - bch2_inum_offset_err_msg(op->c, out, - (subvol_inum) { op->subvol, op->pos.inode, }, - offset << 9); - prt_printf(out, "write error%s: ", - op->flags & BCH_WRITE_MOVE ? 
"(internal move)" : ""); -} + struct printbuf buf = PRINTBUF; -void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) -{ - __bch2_write_op_error(out, op, op->pos.offset); -} + if (op->subvol) { + bch2_inum_offset_err_msg(op->c, &buf, + (subvol_inum) { op->subvol, op->pos.inode, }, + offset << 9); + } else { + struct bpos pos = op->pos; + pos.offset = offset; + bch2_inum_snap_offset_err_msg(op->c, &buf, pos); + } -static void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out, - struct bch_write_op *op, u64 offset) -{ - bch2_inum_offset_err_msg_trans(trans, out, - (subvol_inum) { op->subvol, op->pos.inode, }, - offset << 9); - prt_printf(out, "write error%s: ", - op->flags & BCH_WRITE_MOVE ? "(internal move)" : ""); + prt_str(&buf, "write error: "); + + va_list args; + va_start(args, fmt); + prt_vprintf(&buf, fmt, args); + va_end(args); + + if (op->flags & BCH_WRITE_move) { + struct data_update *u = container_of(op, struct data_update, op); + + prt_printf(&buf, "\n from internal move "); + bch2_bkey_val_to_text(&buf, op->c, bkey_i_to_s_c(u->k.k)); + } + + bch_err_ratelimited(op->c, "%s", buf.buf); + printbuf_exit(&buf); } void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, @@ -493,7 +506,7 @@ static void bch2_write_done(struct closure *cl) bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); bch2_disk_reservation_put(c, &op->res); - if (!(op->flags & BCH_WRITE_MOVE)) + if (!(op->flags & BCH_WRITE_move)) bch2_write_ref_put(c, BCH_WRITE_REF_write); bch2_keylist_free(&op->insert_keys, op->inline_keys); @@ -516,7 +529,7 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) test_bit(ptr->dev, op->failed.d)); if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) - return -EIO; + return -BCH_ERR_data_write_io; } if (dst != src) @@ -539,7 +552,7 @@ static void __bch2_write_index(struct bch_write_op *op) unsigned dev; int ret = 0; - if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { + if (unlikely(op->flags & BCH_WRITE_io_error)) { ret = bch2_write_drop_io_error_ptrs(op); if (ret) goto err; @@ -548,7 +561,7 @@ static void __bch2_write_index(struct bch_write_op *op) if (!bch2_keylist_empty(keys)) { u64 sectors_start = keylist_sectors(keys); - ret = !(op->flags & BCH_WRITE_MOVE) + ret = !(op->flags & BCH_WRITE_move) ? 
bch2_write_index_default(op) : bch2_data_update_index_update(op); @@ -560,11 +573,8 @@ static void __bch2_write_index(struct bch_write_op *op) if (unlikely(ret && !bch2_err_matches(ret, EROFS))) { struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); - struct printbuf buf = PRINTBUF; - __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k)); - prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); + bch2_write_op_error(op, bkey_start_offset(&insert->k), + "btree update error: %s", bch2_err_str(ret)); } if (ret) @@ -573,21 +583,29 @@ out: /* If a bucket wasn't written, we can't erasure code it: */ for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) - bch2_open_bucket_write_error(c, &op->open_buckets, dev); + bch2_open_bucket_write_error(c, &op->open_buckets, dev, -BCH_ERR_data_write_io); bch2_open_buckets_put(c, &op->open_buckets); return; err: keys->top = keys->keys; op->error = ret; - op->flags |= BCH_WRITE_SUBMITTED; + op->flags |= BCH_WRITE_submitted; goto out; } static inline void __wp_update_state(struct write_point *wp, enum write_point_state state) { if (state != wp->state) { + struct task_struct *p = current; u64 now = ktime_get_ns(); + u64 runtime = p->se.sum_exec_runtime + + (now - p->se.exec_start); + + if (state == WRITE_POINT_runnable) + wp->last_runtime = runtime; + else if (wp->state == WRITE_POINT_runnable) + wp->time[WRITE_POINT_running] += runtime - wp->last_runtime; if (wp->last_state_change && time_after64(now, wp->last_state_change)) @@ -601,7 +619,7 @@ static inline void wp_update_state(struct write_point *wp, bool running) { enum write_point_state state; - state = running ? WRITE_POINT_running : + state = running ? WRITE_POINT_runnable: !list_empty(&wp->writes) ? WRITE_POINT_waiting_io : WRITE_POINT_stopped; @@ -615,8 +633,8 @@ static CLOSURE_CALLBACK(bch2_write_index) struct workqueue_struct *wq = index_update_wq(op); unsigned long flags; - if ((op->flags & BCH_WRITE_SUBMITTED) && - (op->flags & BCH_WRITE_MOVE)) + if ((op->flags & BCH_WRITE_submitted) && + (op->flags & BCH_WRITE_move)) bch2_bio_free_pages_pool(op->c, &op->wbio.bio); spin_lock_irqsave(&wp->writes_lock, flags); @@ -654,11 +672,11 @@ void bch2_write_point_do_index_updates(struct work_struct *work) if (!op) break; - op->flags |= BCH_WRITE_IN_WORKER; + op->flags |= BCH_WRITE_in_worker; __bch2_write_index(op); - if (!(op->flags & BCH_WRITE_SUBMITTED)) + if (!(op->flags & BCH_WRITE_submitted)) __bch2_write(op); else bch2_write_done(&op->cl); @@ -676,13 +694,17 @@ static void bch2_write_endio(struct bio *bio) ? 
bch2_dev_have_ref(c, wbio->dev) : NULL; - if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, + wbio->submit_time, !bio->bi_status); + + if (bio->bi_status) { + bch_err_inum_offset_ratelimited(ca, op->pos.inode, wbio->inode_offset << 9, "data write error: %s", - bch2_blk_status_to_str(bio->bi_status))) { + bch2_blk_status_to_str(bio->bi_status)); set_bit(wbio->dev, op->failed.d); - op->flags |= BCH_WRITE_IO_ERROR; + op->flags |= BCH_WRITE_io_error; } if (wbio->nocow) { @@ -692,10 +714,8 @@ static void bch2_write_endio(struct bio *bio) set_bit(wbio->dev, op->devs_need_flush->d); } - if (wbio->have_ioref) { - bch2_latency_acct(ca, wbio->submit_time, WRITE); + if (wbio->have_ioref) percpu_ref_put(&ca->io_ref); - } if (wbio->bounce) bch2_bio_free_pages_pool(c, bio); @@ -729,7 +749,7 @@ static void init_append_extent(struct bch_write_op *op, bch2_extent_crc_append(&e->k_i, crc); bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, - op->flags & BCH_WRITE_CACHED); + op->flags & BCH_WRITE_cached); bch2_keylist_push(&op->insert_keys); } @@ -789,7 +809,6 @@ static int bch2_write_rechecksum(struct bch_fs *c, { struct bio *bio = &op->wbio.bio; struct bch_extent_crc_unpacked new_crc; - int ret; /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ @@ -797,10 +816,10 @@ static int bch2_write_rechecksum(struct bch_fs *c, bch2_csum_type_is_encryption(new_csum_type)) new_csum_type = op->crc.csum_type; - ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, - NULL, &new_crc, - op->crc.offset, op->crc.live_size, - new_csum_type); + int ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, + NULL, &new_crc, + op->crc.offset, op->crc.live_size, + new_csum_type); if (ret) return ret; @@ -810,44 +829,12 @@ static int bch2_write_rechecksum(struct bch_fs *c, return 0; } -static int bch2_write_decrypt(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct nonce nonce = extent_nonce(op->version, op->crc); - struct bch_csum csum; - int ret; - - if (!bch2_csum_type_is_encryption(op->crc.csum_type)) - return 0; - - /* - * If we need to decrypt data in the write path, we'll no longer be able - * to verify the existing checksum (poly1305 mac, in this case) after - * it's decrypted - this is the last point we'll be able to reverify the - * checksum: - */ - csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); - if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) - return -EIO; - - ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); - op->crc.csum_type = 0; - op->crc.csum = (struct bch_csum) { 0, 0 }; - return ret; -} - -static enum prep_encoded_ret { - PREP_ENCODED_OK, - PREP_ENCODED_ERR, - PREP_ENCODED_CHECKSUM_ERR, - PREP_ENCODED_DO_WRITE, -} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) +static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) { struct bch_fs *c = op->c; struct bio *bio = &op->wbio.bio; - - if (!(op->flags & BCH_WRITE_DATA_ENCODED)) - return PREP_ENCODED_OK; + struct bch_csum csum; + int ret = 0; BUG_ON(bio_sectors(bio) != op->crc.compressed_size); @@ -858,12 +845,13 @@ static enum prep_encoded_ret { (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) || op->incompressible)) { if (!crc_is_compressed(op->crc) && - op->csum_type != op->crc.csum_type && - bch2_write_rechecksum(c, op, op->csum_type) && - !c->opts.no_data_io) - return 
PREP_ENCODED_CHECKSUM_ERR; + op->csum_type != op->crc.csum_type) { + ret = bch2_write_rechecksum(c, op, op->csum_type); + if (ret) + return ret; + } - return PREP_ENCODED_DO_WRITE; + return 1; } /* @@ -871,20 +859,24 @@ static enum prep_encoded_ret { * is, we have to decompress it: */ if (crc_is_compressed(op->crc)) { - struct bch_csum csum; - - if (bch2_write_decrypt(op)) - return PREP_ENCODED_CHECKSUM_ERR; - /* Last point we can still verify checksum: */ - csum = bch2_checksum_bio(c, op->crc.csum_type, - extent_nonce(op->version, op->crc), - bio); + struct nonce nonce = extent_nonce(op->version, op->crc); + csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio); if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) - return PREP_ENCODED_CHECKSUM_ERR; + goto csum_err; + + if (bch2_csum_type_is_encryption(op->crc.csum_type)) { + ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio); + if (ret) + return ret; + + op->crc.csum_type = 0; + op->crc.csum = (struct bch_csum) { 0, 0 }; + } - if (bch2_bio_uncompress_inplace(op, bio)) - return PREP_ENCODED_ERR; + ret = bch2_bio_uncompress_inplace(op, bio); + if (ret) + return ret; } /* @@ -896,22 +888,44 @@ static enum prep_encoded_ret { * If the data is checksummed and we're only writing a subset, * rechecksum and adjust bio to point to currently live data: */ - if ((op->crc.live_size != op->crc.uncompressed_size || - op->crc.csum_type != op->csum_type) && - bch2_write_rechecksum(c, op, op->csum_type) && - !c->opts.no_data_io) - return PREP_ENCODED_CHECKSUM_ERR; + if (op->crc.live_size != op->crc.uncompressed_size || + op->crc.csum_type != op->csum_type) { + ret = bch2_write_rechecksum(c, op, op->csum_type); + if (ret) + return ret; + } /* * If we want to compress the data, it has to be decrypted: */ - if ((op->compression_opt || - bch2_csum_type_is_encryption(op->crc.csum_type) != - bch2_csum_type_is_encryption(op->csum_type)) && - bch2_write_decrypt(op)) - return PREP_ENCODED_CHECKSUM_ERR; + if (bch2_csum_type_is_encryption(op->crc.csum_type) && + (op->compression_opt || op->crc.csum_type != op->csum_type)) { + struct nonce nonce = extent_nonce(op->version, op->crc); + csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio); + if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) + goto csum_err; + + ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio); + if (ret) + return ret; + + op->crc.csum_type = 0; + op->crc.csum = (struct bch_csum) { 0, 0 }; + } - return PREP_ENCODED_OK; + return 0; +csum_err: + bch2_write_op_error(op, op->pos.offset, + "error verifying existing checksum while moving existing data (memory corruption?)\n" + " expected %0llx:%0llx got %0llx:%0llx type %s", + op->crc.csum.hi, + op->crc.csum.lo, + csum.hi, + csum.lo, + op->crc.csum_type < BCH_CSUM_NR + ? 
__bch2_csum_types[op->crc.csum_type] + : "(unknown)"); + return -BCH_ERR_data_write_csum; } static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, @@ -926,43 +940,51 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, bool page_alloc_failed = false; int ret, more = 0; + if (op->incompressible) + op->compression_opt = 0; + BUG_ON(!bio_sectors(src)); ec_buf = bch2_writepoint_ec_buf(c, wp); - switch (bch2_write_prep_encoded_data(op, wp)) { - case PREP_ENCODED_OK: - break; - case PREP_ENCODED_ERR: - ret = -EIO; - goto err; - case PREP_ENCODED_CHECKSUM_ERR: - goto csum_err; - case PREP_ENCODED_DO_WRITE: - /* XXX look for bug here */ - if (ec_buf) { - dst = bch2_write_bio_alloc(c, wp, src, - &page_alloc_failed, - ec_buf); - bio_copy_data(dst, src); - bounce = true; + if (unlikely(op->flags & BCH_WRITE_data_encoded)) { + ret = bch2_write_prep_encoded_data(op, wp); + if (ret < 0) + goto err; + if (ret) { + if (ec_buf) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); + bio_copy_data(dst, src); + bounce = true; + } + init_append_extent(op, wp, op->version, op->crc); + goto do_write; } - init_append_extent(op, wp, op->version, op->crc); - goto do_write; } if (ec_buf || op->compression_opt || (op->csum_type && - !(op->flags & BCH_WRITE_PAGES_STABLE)) || + !(op->flags & BCH_WRITE_pages_stable)) || (bch2_csum_type_is_encryption(op->csum_type) && - !(op->flags & BCH_WRITE_PAGES_OWNED))) { + !(op->flags & BCH_WRITE_pages_owned))) { dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed, ec_buf); bounce = true; } +#ifdef CONFIG_BCACHEFS_DEBUG + unsigned write_corrupt_ratio = READ_ONCE(bch2_write_corrupt_ratio); + if (!bounce && write_corrupt_ratio) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); + bounce = true; + } +#endif saved_iter = dst->bi_iter; do { @@ -976,7 +998,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, break; BUG_ON(op->compression_opt && - (op->flags & BCH_WRITE_DATA_ENCODED) && + (op->flags & BCH_WRITE_data_encoded) && bch2_csum_type_is_encryption(op->crc.csum_type)); BUG_ON(op->compression_opt && !bounce); @@ -1014,7 +1036,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, } } - if ((op->flags & BCH_WRITE_DATA_ENCODED) && + if ((op->flags & BCH_WRITE_data_encoded) && !crc_is_compressed(crc) && bch2_csum_type_is_encryption(op->crc.csum_type) == bch2_csum_type_is_encryption(op->csum_type)) { @@ -1032,12 +1054,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, * data can't be modified (by userspace) while it's in * flight. 
*/ - if (bch2_rechecksum_bio(c, src, version, op->crc, + ret = bch2_rechecksum_bio(c, src, version, op->crc, &crc, &op->crc, src_len >> 9, bio_sectors(src) - (src_len >> 9), - op->csum_type)) - goto csum_err; + op->csum_type); + if (ret) + goto err; /* * rchecksum_bio sets compression_type on crc from op->crc, * this isn't always correct as sometimes we're changing @@ -1046,13 +1069,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, crc.compression_type = compression_type; crc.nonce = nonce; } else { - if ((op->flags & BCH_WRITE_DATA_ENCODED) && - bch2_rechecksum_bio(c, src, version, op->crc, + if ((op->flags & BCH_WRITE_data_encoded) && + (ret = bch2_rechecksum_bio(c, src, version, op->crc, NULL, &op->crc, src_len >> 9, bio_sectors(src) - (src_len >> 9), - op->crc.csum_type)) - goto csum_err; + op->crc.csum_type))) + goto err; crc.compressed_size = dst_len >> 9; crc.uncompressed_size = src_len >> 9; @@ -1072,6 +1095,14 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, init_append_extent(op, wp, version, crc); +#ifdef CONFIG_BCACHEFS_DEBUG + if (write_corrupt_ratio) { + swap(dst->bi_iter.bi_size, dst_len); + bch2_maybe_corrupt_bio(dst, write_corrupt_ratio); + swap(dst->bi_iter.bi_size, dst_len); + } +#endif + if (dst != src) bio_advance(dst, dst_len); bio_advance(src, src_len); @@ -1103,16 +1134,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, do_write: *_dst = dst; return more; -csum_err: - { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "error verifying existing checksum while rewriting existing data (memory corruption?)"); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - } - - ret = -EIO; err: if (to_wbio(dst)->bounce) bch2_bio_free_pages_pool(c, dst); @@ -1190,39 +1211,36 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) { struct bch_fs *c = op->c; struct btree_trans *trans = bch2_trans_get(c); + int ret = 0; for_each_keylist_key(&op->insert_keys, orig) { - int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, + ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, bkey_start_pos(&orig->k), orig->k.p, BTREE_ITER_intent, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size); })); - - if (ret && !bch2_err_matches(ret, EROFS)) { - struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); - - struct printbuf buf = PRINTBUF; - bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k)); - prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - } - - if (ret) { - op->error = ret; + if (ret) break; - } } bch2_trans_put(trans); + + if (ret && !bch2_err_matches(ret, EROFS)) { + struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); + bch2_write_op_error(op, bkey_start_offset(&insert->k), + "btree update error: %s", bch2_err_str(ret)); + } + + if (ret) + op->error = ret; } static void __bch2_nocow_write_done(struct bch_write_op *op) { - if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { - op->error = -EIO; - } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN)) + if (unlikely(op->flags & BCH_WRITE_io_error)) { + op->error = -BCH_ERR_data_write_io; + } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten)) bch2_nocow_write_convert_unwritten(op); } @@ -1251,7 +1269,7 @@ static void bch2_nocow_write(struct 
bch_write_op *op) struct bucket_to_lock *stale_at; int stale, ret; - if (op->flags & BCH_WRITE_MOVE) + if (op->flags & BCH_WRITE_move) return; darray_init(&buckets); @@ -1309,7 +1327,7 @@ retry: }), GFP_KERNEL|__GFP_NOFAIL); if (ptr->unwritten) - op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; + op->flags |= BCH_WRITE_convert_unwritten; } /* Unlock before taking nocow locks, doing IO: */ @@ -1317,7 +1335,7 @@ retry: bch2_trans_unlock(trans); bch2_cut_front(op->pos, op->insert_keys.top); - if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN) + if (op->flags & BCH_WRITE_convert_unwritten) bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); darray_for_each(buckets, i) { @@ -1342,7 +1360,7 @@ retry: wbio_init(bio)->put_bio = true; bio->bi_opf = op->wbio.bio.bi_opf; } else { - op->flags |= BCH_WRITE_SUBMITTED; + op->flags |= BCH_WRITE_submitted; } op->pos.offset += bio_sectors(bio); @@ -1352,11 +1370,12 @@ retry: bio->bi_private = &op->cl; bio->bi_opf |= REQ_OP_WRITE; closure_get(&op->cl); + bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, op->insert_keys.top, true); bch2_keylist_push(&op->insert_keys); - if (op->flags & BCH_WRITE_SUBMITTED) + if (op->flags & BCH_WRITE_submitted) break; bch2_btree_iter_advance(&iter); } @@ -1370,21 +1389,18 @@ err: darray_exit(&buckets); if (ret) { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); + bch2_write_op_error(op, op->pos.offset, + "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); op->error = ret; - op->flags |= BCH_WRITE_SUBMITTED; + op->flags |= BCH_WRITE_submitted; } /* fallback to cow write path? */ - if (!(op->flags & BCH_WRITE_SUBMITTED)) { + if (!(op->flags & BCH_WRITE_submitted)) { closure_sync(&op->cl); __bch2_nocow_write_done(op); op->insert_keys.top = op->insert_keys.keys; - } else if (op->flags & BCH_WRITE_SYNC) { + } else if (op->flags & BCH_WRITE_sync) { closure_sync(&op->cl); bch2_nocow_write_done(&op->cl.work); } else { @@ -1414,7 +1430,7 @@ err_bucket_stale: "pointer to invalid bucket in nocow path on device %llu\n %s", stale_at->b.inode, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = -EIO; + ret = -BCH_ERR_data_write_invalid_ptr; } else { /* We can retry this: */ ret = -BCH_ERR_transaction_restart; @@ -1436,7 +1452,7 @@ static void __bch2_write(struct bch_write_op *op) if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { bch2_nocow_write(op); - if (op->flags & BCH_WRITE_SUBMITTED) + if (op->flags & BCH_WRITE_submitted) goto out_nofs_restore; } again: @@ -1466,7 +1482,7 @@ again: ret = bch2_trans_run(c, lockrestart_do(trans, bch2_alloc_sectors_start_trans(trans, op->target, - op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), + op->opts.erasure_code && !(op->flags & BCH_WRITE_cached), op->write_point, &op->devs_have, op->nr_replicas, @@ -1489,16 +1505,12 @@ again: bch2_alloc_sectors_done_inlined(c, wp); err: if (ret <= 0) { - op->flags |= BCH_WRITE_SUBMITTED; + op->flags |= BCH_WRITE_submitted; if (unlikely(ret < 0)) { - if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - } + if (!(op->flags & BCH_WRITE_alloc_nowait)) + bch2_write_op_error(op, op->pos.offset, + "%s(): %s", __func__, bch2_err_str(ret)); op->error = ret; break; } 
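(A note on the flag renames running through these io_write.c hunks: BCH_WRITE_SUBMITTED, BCH_WRITE_MOVE and friends become BCH_WRITE_submitted, BCH_WRITE_move because io_read.h and io_write.h in this patch generate both the bit-index enum and the bitmask enum from a single x-macro list, so the stringified list entry can double as the name printed by prt_bitflags(). A minimal standalone sketch of that pattern follows; the DEMO_* names are hypothetical illustrations, not bcachefs identifiers:

#include <stdio.h>

/* One list is the single source of truth (hypothetical demo flags): */
#define DEMO_FLAGS()	\
	x(sync)		\
	x(cached)	\
	x(submitted)

/* Bit indices: __DEMO_sync = 0, __DEMO_cached = 1, ... */
enum __demo_flags {
#define x(n)	__DEMO_##n,
	DEMO_FLAGS()
#undef x
	__DEMO_NR,
};

/* Bitmasks: DEMO_sync = 1 << 0, DEMO_cached = 1 << 1, ... */
enum demo_flags {
#define x(n)	DEMO_##n = 1U << __DEMO_##n,
	DEMO_FLAGS()
#undef x
};

/* Name table generated from the same list, so it can never drift out of
 * sync; this is the role bch2_write_flags[] plays for prt_bitflags(): */
static const char * const demo_flag_names[] = {
#define x(n)	#n,
	DEMO_FLAGS()
#undef x
	NULL
};

int main(void)
{
	unsigned flags = DEMO_sync|DEMO_submitted;

	/* Print every set flag by walking the generated table: */
	for (unsigned i = 0; i < __DEMO_NR; i++)
		if (flags & (1U << i))
			printf("%s ", demo_flag_names[i]);
	printf("\n");	/* prints: sync submitted */
	return 0;
}

Adding a flag is then a one-line change to the DEMO_FLAGS() list; the index, the mask, and the printable name all follow automatically, which is why the shouting-case constants had to become lowercase: the enumerator spelling now must match the x-macro argument verbatim.)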
@@ -1524,14 +1536,14 @@ err: * synchronously here if we weren't able to submit all of the IO at * once, as that signals backpressure to the caller. */ - if ((op->flags & BCH_WRITE_SYNC) || - (!(op->flags & BCH_WRITE_SUBMITTED) && - !(op->flags & BCH_WRITE_IN_WORKER))) { + if ((op->flags & BCH_WRITE_sync) || + (!(op->flags & BCH_WRITE_submitted) && + !(op->flags & BCH_WRITE_in_worker))) { bch2_wait_on_allocator(c, &op->cl); __bch2_write_index(op); - if (!(op->flags & BCH_WRITE_SUBMITTED)) + if (!(op->flags & BCH_WRITE_submitted)) goto again; bch2_write_done(&op->cl); } else { @@ -1552,8 +1564,8 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) memset(&op->failed, 0, sizeof(op->failed)); - op->flags |= BCH_WRITE_WROTE_DATA_INLINE; - op->flags |= BCH_WRITE_SUBMITTED; + op->flags |= BCH_WRITE_wrote_data_inline; + op->flags |= BCH_WRITE_submitted; bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); @@ -1616,8 +1628,8 @@ CLOSURE_CALLBACK(bch2_write) BUG_ON(!op->write_point.v); BUG_ON(bkey_eq(op->pos, POS_MAX)); - if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) - op->flags |= BCH_WRITE_ALLOC_NOWAIT; + if (op->flags & BCH_WRITE_only_specified_devs) + op->flags |= BCH_WRITE_alloc_nowait; op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas); op->start_time = local_clock(); @@ -1625,11 +1637,8 @@ CLOSURE_CALLBACK(bch2_write) wbio_init(bio)->put_bio = false; if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "misaligned write"); - printbuf_exit(&buf); - op->error = -EIO; + bch2_write_op_error(op, op->pos.offset, "misaligned write"); + op->error = -BCH_ERR_data_write_misaligned; goto err; } @@ -1638,13 +1647,14 @@ CLOSURE_CALLBACK(bch2_write) goto err; } - if (!(op->flags & BCH_WRITE_MOVE) && + if (!(op->flags & BCH_WRITE_move) && !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { op->error = -BCH_ERR_erofs_no_writes; goto err; } - this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); + if (!(op->flags & BCH_WRITE_move)) + this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); bch2_increment_clock(c, bio_sectors(bio), WRITE); data_len = min_t(u64, bio->bi_iter.bi_size, @@ -1675,20 +1685,26 @@ static const char * const bch2_write_flags[] = { void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) { - prt_str(out, "pos: "); + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); + + prt_printf(out, "pos:\t"); bch2_bpos_to_text(out, op->pos); prt_newline(out); printbuf_indent_add(out, 2); - prt_str(out, "started: "); + prt_printf(out, "started:\t"); bch2_pr_time_units(out, local_clock() - op->start_time); prt_newline(out); - prt_str(out, "flags: "); + prt_printf(out, "flags:\t"); prt_bitflags(out, bch2_write_flags, op->flags); prt_newline(out); - prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl)); + prt_printf(out, "nr_replicas:\t%u\n", op->nr_replicas); + prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required); + + prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl)); printbuf_indent_sub(out, 2); } diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h index b4626013abc8..b8ab19a1e1da 100644 --- a/fs/bcachefs/io_write.h +++ b/fs/bcachefs/io_write.h @@ -11,33 +11,27 @@ void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -void 
bch2_latency_acct(struct bch_dev *, u64, int); -#else -static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} -#endif - void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, enum bch_data_type, const struct bkey_i *, bool); -void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op); +__printf(3, 4) +void bch2_write_op_error(struct bch_write_op *op, u64, const char *, ...); #define BCH_WRITE_FLAGS() \ - x(ALLOC_NOWAIT) \ - x(CACHED) \ - x(DATA_ENCODED) \ - x(PAGES_STABLE) \ - x(PAGES_OWNED) \ - x(ONLY_SPECIFIED_DEVS) \ - x(WROTE_DATA_INLINE) \ - x(FROM_INTERNAL) \ - x(CHECK_ENOSPC) \ - x(SYNC) \ - x(MOVE) \ - x(IN_WORKER) \ - x(SUBMITTED) \ - x(IO_ERROR) \ - x(CONVERT_UNWRITTEN) + x(alloc_nowait) \ + x(cached) \ + x(data_encoded) \ + x(pages_stable) \ + x(pages_owned) \ + x(only_specified_devs) \ + x(wrote_data_inline) \ + x(check_enospc) \ + x(sync) \ + x(move) \ + x(in_worker) \ + x(submitted) \ + x(io_error) \ + x(convert_unwritten) enum __bch_write_flags { #define x(f) __BCH_WRITE_##f, diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h index 6e878a6f2f0b..3ef6df9145ef 100644 --- a/fs/bcachefs/io_write_types.h +++ b/fs/bcachefs/io_write_types.h @@ -64,7 +64,7 @@ struct bch_write_op { struct bpos pos; struct bversion version; - /* For BCH_WRITE_DATA_ENCODED: */ + /* For BCH_WRITE_data_encoded: */ struct bch_extent_crc_unpacked crc; struct write_point_specifier write_point; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 05b1250619ec..8a36d5536668 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -20,13 +20,6 @@ #include "journal_seq_blacklist.h" #include "trace.h" -static const char * const bch2_journal_errors[] = { -#define x(n) #n, - JOURNAL_ERRORS() -#undef x - NULL -}; - static inline bool journal_seq_unwritten(struct journal *j, u64 seq) { return seq > j->seq_ondisk; @@ -56,14 +49,20 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 prt_printf(out, "seq:\t%llu\n", seq); printbuf_indent_add(out, 2); - prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i)); + if (!buf->write_started) + prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i & JOURNAL_STATE_BUF_MASK)); - prt_printf(out, "size:\t"); - prt_human_readable_u64(out, vstruct_bytes(buf->data)); - prt_newline(out); + struct closure *cl = &buf->io; + int r = atomic_read(&cl->remaining); + prt_printf(out, "io:\t%pS r %i\n", cl->fn, r & CLOSURE_REMAINING_MASK); + + if (buf->data) { + prt_printf(out, "size:\t"); + prt_human_readable_u64(out, vstruct_bytes(buf->data)); + prt_newline(out); + } - prt_printf(out, "expires:\t"); - prt_printf(out, "%li jiffies\n", buf->expires - jiffies); + prt_printf(out, "expires:\t%li jiffies\n", buf->expires - jiffies); prt_printf(out, "flags:\t"); if (buf->noflush) @@ -87,6 +86,9 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) { + lockdep_assert_held(&j->lock); + out->atomic++; + if (!out->nr_tabstops) printbuf_tabstop_push(out, 24); @@ -95,6 +97,8 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) seq++) bch2_journal_buf_to_text(out, j, seq); prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? 
"open" : "closed"); + + --out->atomic; } static inline struct journal_buf * @@ -104,10 +108,8 @@ journal_seq_to_buf(struct journal *j, u64 seq) EBUG_ON(seq > journal_cur_seq(j)); - if (journal_seq_unwritten(j, seq)) { + if (journal_seq_unwritten(j, seq)) buf = j->buf + (seq & JOURNAL_BUF_MASK); - EBUG_ON(le64_to_cpu(buf->data->seq) != seq); - } return buf; } @@ -139,8 +141,10 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) bool stuck = false; struct printbuf buf = PRINTBUF; - if (!(error == JOURNAL_ERR_journal_full || - error == JOURNAL_ERR_journal_pin_full) || + buf.atomic++; + + if (!(error == -BCH_ERR_journal_full || + error == -BCH_ERR_journal_pin_full) || nr_unwritten_journal_entries(j) || (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) return stuck; @@ -164,12 +168,12 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) return stuck; } j->err_seq = journal_cur_seq(j); - spin_unlock(&j->lock); - bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)", - bch2_journal_errors[error]); - bch2_journal_debug_to_text(&buf, j); - bch_err(c, "%s", buf.buf); + __bch2_journal_debug_to_text(&buf, j); + spin_unlock(&j->lock); + prt_printf(&buf, bch2_fmt(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)"), + bch2_err_str(error)); + bch2_print_string_as_lines(KERN_ERR, buf.buf); printbuf_reset(&buf); bch2_journal_pins_to_text(&buf, j); @@ -195,7 +199,8 @@ void bch2_journal_do_writes(struct journal *j) if (w->write_started) continue; - if (!journal_state_count(j->reservations, idx)) { + if (!journal_state_seq_count(j, j->reservations, seq)) { + j->seq_write_started = seq; w->write_started = true; closure_call(&w->io, bch2_journal_write, j->wq, NULL); } @@ -306,7 +311,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t bch2_journal_space_available(j); - __bch2_journal_buf_put(j, old.idx, le64_to_cpu(buf->data->seq)); + __bch2_journal_buf_put(j, le64_to_cpu(buf->data->seq)); } void bch2_journal_halt(struct journal *j) @@ -377,29 +382,41 @@ static int journal_entry_open(struct journal *j) BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); if (j->blocked) - return JOURNAL_ERR_blocked; + return -BCH_ERR_journal_blocked; if (j->cur_entry_error) return j->cur_entry_error; - if (bch2_journal_error(j)) - return JOURNAL_ERR_insufficient_devices; /* -EROFS */ + int ret = bch2_journal_error(j); + if (unlikely(ret)) + return ret; if (!fifo_free(&j->pin)) - return JOURNAL_ERR_journal_pin_full; + return -BCH_ERR_journal_pin_full; if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) - return JOURNAL_ERR_max_in_flight; + return -BCH_ERR_journal_max_in_flight; + + if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR) + return -BCH_ERR_journal_max_open; if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) { bch_err(c, "cannot start: journal seq overflow"); if (bch2_fs_emergency_read_only_locked(c)) bch_err(c, "fatal error - emergency read only"); - return JOURNAL_ERR_insufficient_devices; /* -EROFS */ + return -BCH_ERR_journal_shutdown; } + if (!j->free_buf && !buf->data) + return -BCH_ERR_journal_buf_enomem; /* will retry after write completion frees up a buf */ + BUG_ON(!j->cur_entry_sectors); + if (!buf->data) { + swap(buf->data, j->free_buf); + swap(buf->buf_size, j->free_buf_size); + } + buf->expires = (journal_cur_seq(j) == j->flushed_seq_ondisk ? 
jiffies @@ -415,7 +432,7 @@ static int journal_entry_open(struct journal *j) u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); if (u64s <= (ssize_t) j->early_journal_entries.nr) - return JOURNAL_ERR_journal_full; + return -BCH_ERR_journal_full; if (fifo_empty(&j->pin) && j->reclaim_thread) wake_up_process(j->reclaim_thread); @@ -464,7 +481,7 @@ static int journal_entry_open(struct journal *j) new.idx++; BUG_ON(journal_state_count(new, new.idx)); - BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK)); + BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_STATE_BUF_MASK)); journal_state_inc(&new); @@ -514,6 +531,33 @@ static void journal_write_work(struct work_struct *work) spin_unlock(&j->lock); } +static void journal_buf_prealloc(struct journal *j) +{ + if (j->free_buf && + j->free_buf_size >= j->buf_size_want) + return; + + unsigned buf_size = j->buf_size_want; + + spin_unlock(&j->lock); + void *buf = kvmalloc(buf_size, GFP_NOFS); + spin_lock(&j->lock); + + if (buf && + (!j->free_buf || + buf_size > j->free_buf_size)) { + swap(buf, j->free_buf); + swap(buf_size, j->free_buf_size); + } + + if (unlikely(buf)) { + spin_unlock(&j->lock); + /* kvfree can sleep */ + kvfree(buf); + spin_lock(&j->lock); + } +} + static int __journal_res_get(struct journal *j, struct journal_res *res, unsigned flags) { @@ -525,25 +569,28 @@ retry: if (journal_res_get_fast(j, res, flags)) return 0; - if (bch2_journal_error(j)) - return -BCH_ERR_erofs_journal_err; + ret = bch2_journal_error(j); + if (unlikely(ret)) + return ret; if (j->blocked) - return -BCH_ERR_journal_res_get_blocked; + return -BCH_ERR_journal_blocked; if ((flags & BCH_WATERMARK_MASK) < j->watermark) { - ret = JOURNAL_ERR_journal_full; + ret = -BCH_ERR_journal_full; can_discard = j->can_discard; goto out; } if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) { - ret = JOURNAL_ERR_max_in_flight; + ret = -BCH_ERR_journal_max_in_flight; goto out; } spin_lock(&j->lock); + journal_buf_prealloc(j); + /* * Recheck after taking the lock, so we don't race with another thread * that just did journal_entry_open() and call bch2_journal_entry_close() @@ -566,25 +613,48 @@ retry: j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false); - ret = journal_entry_open(j) ?: JOURNAL_ERR_retry; + ret = journal_entry_open(j) ?: -BCH_ERR_journal_retry_open; unlock: can_discard = j->can_discard; spin_unlock(&j->lock); out: - if (ret == JOURNAL_ERR_retry) - goto retry; - if (!ret) + if (likely(!ret)) return 0; + if (ret == -BCH_ERR_journal_retry_open) + goto retry; if (journal_error_check_stuck(j, ret, flags)) - ret = -BCH_ERR_journal_res_get_blocked; + ret = -BCH_ERR_journal_stuck; + + if (ret == -BCH_ERR_journal_max_in_flight && + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) && + trace_journal_entry_full_enabled()) { + struct printbuf buf = PRINTBUF; - if (ret == JOURNAL_ERR_max_in_flight && - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) { + bch2_printbuf_make_room(&buf, 4096); + spin_lock(&j->lock); + prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); + bch2_journal_bufs_to_text(&buf, j); + spin_unlock(&j->lock); + + trace_journal_entry_full(c, buf.buf); + printbuf_exit(&buf); + count_event(c, journal_entry_full); + } + + if (ret == -BCH_ERR_journal_max_open && + track_event_change(&c->times[BCH_TIME_blocked_journal_max_open], true) && + trace_journal_entry_full_enabled()) { struct printbuf buf = 
PRINTBUF; + + bch2_printbuf_make_room(&buf, 4096); + + spin_lock(&j->lock); prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); bch2_journal_bufs_to_text(&buf, j); + spin_unlock(&j->lock); + trace_journal_entry_full(c, buf.buf); printbuf_exit(&buf); count_event(c, journal_entry_full); @@ -594,8 +664,8 @@ out: * Journal is full - can't rely on reclaim from work item due to * freezing: */ - if ((ret == JOURNAL_ERR_journal_full || - ret == JOURNAL_ERR_journal_pin_full) && + if ((ret == -BCH_ERR_journal_full || + ret == -BCH_ERR_journal_pin_full) && !(flags & JOURNAL_RES_GET_NONBLOCK)) { if (can_discard) { bch2_journal_do_discards(j); @@ -608,9 +678,7 @@ out: } } - return ret == JOURNAL_ERR_insufficient_devices - ? -BCH_ERR_erofs_journal_err - : -BCH_ERR_journal_res_get_blocked; + return ret; } static unsigned max_dev_latency(struct bch_fs *c) @@ -640,7 +708,7 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, int ret; if (closure_wait_event_timeout(&j->async_wait, - (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || + !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || (flags & JOURNAL_RES_GET_NONBLOCK), HZ)) return ret; @@ -654,19 +722,19 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, remaining_wait = max(0, remaining_wait - HZ); if (closure_wait_event_timeout(&j->async_wait, - (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || + !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || (flags & JOURNAL_RES_GET_NONBLOCK), remaining_wait)) return ret; struct printbuf buf = PRINTBUF; bch2_journal_debug_to_text(&buf, j); - bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s", - buf.buf); + bch2_print_string_as_lines(KERN_ERR, buf.buf); + prt_printf(&buf, bch2_fmt(c, "Journal stuck? Waited for 10 seconds, err %s"), bch2_err_str(ret)); printbuf_exit(&buf); closure_wait_event(&j->async_wait, - (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || + !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || (flags & JOURNAL_RES_GET_NONBLOCK)); return ret; } @@ -687,7 +755,6 @@ void bch2_journal_entry_res_resize(struct journal *j, goto out; j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d); - smp_mb(); state = READ_ONCE(j->reservations); if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL && @@ -907,7 +974,7 @@ int bch2_journal_meta(struct journal *j) struct bch_fs *c = container_of(j, struct bch_fs, journal); if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_journal)) - return -EROFS; + return -BCH_ERR_erofs_no_writes; int ret = __bch2_journal_meta(j); bch2_write_ref_put(c, BCH_WRITE_REF_journal); @@ -951,7 +1018,8 @@ static void __bch2_journal_block(struct journal *j) new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL; } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); - journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset); + if (old.cur_entry_offset < JOURNAL_ENTRY_BLOCKED_VAL) + journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset); } } @@ -992,7 +1060,7 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou *blocked = true; } - ret = journal_state_count(s, idx) > open + ret = journal_state_count(s, idx & JOURNAL_STATE_BUF_MASK) > open ? 
ERR_PTR(-EAGAIN) : buf; break; @@ -1349,6 +1417,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) j->replay_journal_seq_end = cur_seq; j->last_seq_ondisk = last_seq; j->flushed_seq_ondisk = cur_seq - 1; + j->seq_write_started = cur_seq - 1; j->seq_ondisk = cur_seq - 1; j->pin.front = last_seq; j->pin.back = cur_seq; @@ -1389,8 +1458,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) set_bit(JOURNAL_running, &j->flags); j->last_flush_write = jiffies; - j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); - j->reservations.unwritten_idx++; + j->reservations.idx = journal_cur_seq(j); c->last_bucket_seq_cleanup = journal_cur_seq(j); @@ -1443,7 +1511,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { - ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, + ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, nr_bvecs), GFP_KERNEL); if (!ja->bio[i]) return -BCH_ERR_ENOMEM_dev_journal_init; @@ -1482,6 +1550,7 @@ void bch2_fs_journal_exit(struct journal *j) for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) kvfree(j->buf[i].data); + kvfree(j->free_buf); free_fifo(&j->pin); } @@ -1508,13 +1577,13 @@ int bch2_fs_journal_init(struct journal *j) if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) return -BCH_ERR_ENOMEM_journal_pin_fifo; - for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) { - j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; - j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL); - if (!j->buf[i].data) - return -BCH_ERR_ENOMEM_journal_buf; + j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN; + j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL); + if (!j->free_buf) + return -BCH_ERR_ENOMEM_journal_buf; + + for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) j->buf[i].idx = i; - } j->pin.front = j->pin.back = 1; @@ -1564,6 +1633,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_printf(out, "average write size:\t"); prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0); prt_newline(out); + prt_printf(out, "free buf:\t%u\n", j->free_buf ? j->free_buf_size : 0); prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked); @@ -1571,7 +1641,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) ? 
jiffies_to_msecs(j->next_reclaim - jiffies) : 0); prt_printf(out, "blocked:\t%u\n", j->blocked); prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); - prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); + prt_printf(out, "current entry error:\t%s\n", bch2_err_str(j->cur_entry_error)); prt_printf(out, "current entry:\t"); switch (s.cur_entry_offset) { diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 107f7f901cd9..47828771f9c2 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -121,11 +121,6 @@ static inline void journal_wake(struct journal *j) closure_wake_up(&j->async_wait); } -static inline struct journal_buf *journal_cur_buf(struct journal *j) -{ - return j->buf + j->reservations.idx; -} - /* Sequence number of oldest dirty journal entry */ static inline u64 journal_last_seq(struct journal *j) @@ -143,6 +138,15 @@ static inline u64 journal_last_unwritten_seq(struct journal *j) return j->seq_ondisk + 1; } +static inline struct journal_buf *journal_cur_buf(struct journal *j) +{ + unsigned idx = (journal_cur_seq(j) & + JOURNAL_BUF_MASK & + ~JOURNAL_STATE_BUF_MASK) + j->reservations.idx; + + return j->buf + idx; +} + static inline int journal_state_count(union journal_res_state s, int idx) { switch (idx) { @@ -154,6 +158,15 @@ static inline int journal_state_count(union journal_res_state s, int idx) BUG(); } +static inline int journal_state_seq_count(struct journal *j, + union journal_res_state s, u64 seq) +{ + if (journal_cur_seq(j) - seq < JOURNAL_STATE_BUF_NR) + return journal_state_count(s, seq & JOURNAL_STATE_BUF_MASK); + else + return 0; +} + static inline void journal_state_inc(union journal_res_state *s) { s->buf0_count += s->idx == 0; @@ -193,7 +206,7 @@ bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) static inline struct jset_entry * journal_res_entry(struct journal *j, struct journal_res *res) { - return vstruct_idx(j->buf[res->idx].data, res->offset); + return vstruct_idx(j->buf[res->seq & JOURNAL_BUF_MASK].data, res->offset); } static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type, @@ -267,8 +280,9 @@ bool bch2_journal_entry_close(struct journal *); void bch2_journal_do_writes(struct journal *); void bch2_journal_buf_put_final(struct journal *, u64); -static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) +static inline void __bch2_journal_buf_put(struct journal *j, u64 seq) { + unsigned idx = seq & JOURNAL_STATE_BUF_MASK; union journal_res_state s; s = journal_state_buf_put(j, idx); @@ -276,8 +290,9 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s bch2_journal_buf_put_final(j, seq); } -static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) +static inline void bch2_journal_buf_put(struct journal *j, u64 seq) { + unsigned idx = seq & JOURNAL_STATE_BUF_MASK; union journal_res_state s; s = journal_state_buf_put(j, idx); @@ -306,7 +321,7 @@ static inline void bch2_journal_res_put(struct journal *j, BCH_JSET_ENTRY_btree_keys, 0, 0, 0); - bch2_journal_buf_put(j, res->idx, res->seq); + bch2_journal_buf_put(j, res->seq); res->ref = 0; } @@ -335,8 +350,10 @@ static inline int journal_res_get_fast(struct journal *j, /* * Check if there is still room in the current journal - * entry: + * entry, smp_rmb() guarantees that reads from reservations.counter + * occur before accessing cur_entry_u64s: */ + smp_rmb(); if (new.cur_entry_offset + res->u64s > 
j->cur_entry_u64s) return 0; @@ -361,9 +378,9 @@ static inline int journal_res_get_fast(struct journal *j, &old.v, new.v)); res->ref = true; - res->idx = old.idx; res->offset = old.cur_entry_offset; - res->seq = le64_to_cpu(j->buf[old.idx].data->seq); + res->seq = journal_cur_seq(j); + res->seq -= (res->seq - old.idx) & JOURNAL_STATE_BUF_MASK; return 1; } @@ -390,6 +407,7 @@ out: (flags & JOURNAL_RES_GET_NONBLOCK) != 0, NULL, _THIS_IP_); EBUG_ON(!res->ref); + BUG_ON(!res->seq); } return 0; } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 11c39e0c34f4..2debc213e47c 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -214,12 +214,12 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, fsck_err_on(same_device, c, journal_entry_dup_same_device, - "duplicate journal entry on same device\n %s", + "duplicate journal entry on same device\n%s", buf.buf); fsck_err_on(not_identical, c, journal_entry_replicas_data_mismatch, - "found duplicate but non identical journal entries\n %s", + "found duplicate but non identical journal entries\n%s", buf.buf); if (entry_ptr.csum_good && !identical) @@ -308,8 +308,8 @@ static void journal_entry_err_msg(struct printbuf *out, break; \ case WRITE: \ bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ - bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\ - if (bch2_fs_inconsistent(c)) { \ + if (bch2_fs_inconsistent(c, \ + "corrupt metadata before write: %s\n", _buf.buf)) {\ ret = -BCH_ERR_fsck_errors_not_fixed; \ goto fsck_err; \ } \ @@ -764,6 +764,23 @@ static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs journal_entry_btree_keys_to_text(out, c, entry); } +static int journal_entry_log_bkey_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + struct bkey_validate_context from) +{ + from.flags = 0; + return journal_entry_btree_keys_validate(c, jset, entry, + version, big_endian, from); +} + +static void journal_entry_log_bkey_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + journal_entry_btree_keys_to_text(out, c, entry); +} + static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, @@ -1041,13 +1058,19 @@ reread: bio->bi_iter.bi_sector = offset; bch2_bio_map(bio, buf->data, sectors_read << 9); + u64 submit_time = local_clock(); ret = submit_bio_wait(bio); kfree(bio); - if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read, - "journal read error: sector %llu", - offset) || - bch2_meta_read_fault("journal")) { + if (!ret && bch2_meta_read_fault("journal")) + ret = -BCH_ERR_EIO_fault_injected; + + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, + submit_time, !ret); + + if (ret) { + bch_err_dev_ratelimited(ca, + "journal read error: sector %llu", offset); /* * We don't error out of the recovery process * here, since the relevant journal entry may be @@ -1110,13 +1133,16 @@ reread: struct bch_csum csum; csum_good = jset_csum_good(c, j, &csum); - if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum, - "%s", - (printbuf_reset(&err), - prt_str(&err, "journal "), - bch2_csum_err_msg(&err, csum_type, j->csum, csum), - err.buf))) + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good); + + if (!csum_good) { + bch_err_dev_ratelimited(ca, "%s", + (printbuf_reset(&err), + prt_str(&err, "journal "), + bch2_csum_err_msg(&err, csum_type, j->csum, csum), + err.buf)); saw_bad = true; + } ret = bch2_encrypt(c, 
JSET_CSUM_TYPE(j), journal_nonce(j), j->encrypted_start, @@ -1362,8 +1388,8 @@ int bch2_journal_read(struct bch_fs *c, missing_end = seq - 1; fsck_err(c, journal_entries_missing, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" - " prev at %s\n" - " next at %s, continue?", + "prev at %s\n" + "next at %s, continue?", missing_start, missing_end, *last_seq, *blacklist_seq - 1, buf1.buf, buf2.buf); @@ -1417,7 +1443,7 @@ int bch2_journal_read(struct bch_fs *c, !bch2_replicas_marked(c, &replicas.e) && (le64_to_cpu(i->j.seq) == *last_seq || fsck_err(c, journal_entry_replicas_not_marked, - "superblock not marked as containing replicas for journal entry %llu\n %s", + "superblock not marked as containing replicas for journal entry %llu\n%s", le64_to_cpu(i->j.seq), buf.buf))) { ret = bch2_mark_replicas(c, &replicas.e); if (ret) @@ -1515,7 +1541,7 @@ static void __journal_write_alloc(struct journal *j, * @j: journal object * @w: journal buf (entry to be written) * - * Returns: 0 on success, or -EROFS on failure + * Returns: 0 on success, or -BCH_ERR_insufficient_devices on failure */ static int journal_write_alloc(struct journal *j, struct journal_buf *w) { @@ -1600,18 +1626,12 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) kvfree(new_buf); } -static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) -{ - return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); -} - static CLOSURE_CALLBACK(journal_write_done) { closure_type(w, struct journal_buf, io); struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_replicas_padded replicas; - union journal_res_state old, new; u64 seq = le64_to_cpu(w->data->seq); int err = 0; @@ -1620,13 +1640,13 @@ static CLOSURE_CALLBACK(journal_write_done) : j->noflush_write_time, j->write_start_time); if (!w->devs_written.nr) { - bch_err(c, "unable to write journal to sufficient devices"); - err = -EIO; + if (!bch2_journal_error(j)) + bch_err(c, "unable to write journal to sufficient devices"); + err = -BCH_ERR_journal_write_err; } else { bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, w->devs_written); - if (bch2_mark_replicas(c, &replicas.e)) - err = -EIO; + err = bch2_mark_replicas(c, &replicas.e); } if (err) @@ -1641,7 +1661,23 @@ static CLOSURE_CALLBACK(journal_write_done) j->err_seq = seq; w->write_done = true; + if (!j->free_buf || j->free_buf_size < w->buf_size) { + swap(j->free_buf, w->data); + swap(j->free_buf_size, w->buf_size); + } + + if (w->data) { + void *buf = w->data; + w->data = NULL; + w->buf_size = 0; + + spin_unlock(&j->lock); + kvfree(buf); + spin_lock(&j->lock); + } + bool completed = false; + bool do_discards = false; for (seq = journal_last_unwritten_seq(j); seq <= journal_cur_seq(j); @@ -1650,11 +1686,10 @@ static CLOSURE_CALLBACK(journal_write_done) if (!w->write_done) break; - if (!j->err_seq && !JSET_NO_FLUSH(w->data)) { + if (!j->err_seq && !w->noflush) { j->flushed_seq_ondisk = seq; j->last_seq_ondisk = w->last_seq; - bch2_do_discards(c); closure_wake_up(&c->freelist_wait); bch2_reset_alloc_cursors(c); } @@ -1671,16 +1706,6 @@ static CLOSURE_CALLBACK(journal_write_done) if (j->watermark != BCH_WATERMARK_stripe) journal_reclaim_kick(&c->journal); - old.v = atomic64_read(&j->reservations.counter); - do { - new.v = old.v; - BUG_ON(journal_state_count(new, new.unwritten_idx)); - BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK)); - - new.unwritten_idx++; - } while 
(!atomic64_try_cmpxchg(&j->reservations.counter, - &old.v, new.v)); - closure_wake_up(&w->wait); completed = true; } @@ -1695,7 +1720,7 @@ static CLOSURE_CALLBACK(journal_write_done) } if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && - new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { + j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { struct journal_buf *buf = journal_cur_buf(j); long delta = buf->expires - jiffies; @@ -1715,6 +1740,9 @@ static CLOSURE_CALLBACK(journal_write_done) */ bch2_journal_do_writes(j); spin_unlock(&j->lock); + + if (do_discards) + bch2_do_discards(c); } static void journal_write_endio(struct bio *bio) @@ -1724,13 +1752,16 @@ static void journal_write_endio(struct bio *bio) struct journal *j = &ca->fs->journal; struct journal_buf *w = j->buf + jbio->buf_idx; - if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, + jbio->submit_time, !bio->bi_status); + + if (bio->bi_status) { + bch_err_dev_ratelimited(ca, "error writing journal entry %llu: %s", le64_to_cpu(w->data->seq), - bch2_blk_status_to_str(bio->bi_status)) || - bch2_meta_write_fault("journal")) { - unsigned long flags; + bch2_blk_status_to_str(bio->bi_status)); + unsigned long flags; spin_lock_irqsave(&j->err_lock, flags); bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); spin_unlock_irqrestore(&j->err_lock, flags); @@ -1759,7 +1790,11 @@ static CLOSURE_CALLBACK(journal_write_submit) sectors); struct journal_device *ja = &ca->journal; - struct bio *bio = &ja->bio[w->idx]->bio; + struct journal_bio *jbio = ja->bio[w->idx]; + struct bio *bio = &jbio->bio; + + jbio->submit_time = local_clock(); + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = ptr->offset; bio->bi_end_io = journal_write_endio; @@ -1791,6 +1826,10 @@ static CLOSURE_CALLBACK(journal_write_preflush) struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); + /* + * Wait for previous journal writes to complete; they won't necessarily + * be flushed if they're still in flight + */ if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { spin_lock(&j->lock); if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { @@ -1984,7 +2023,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * * write anything at all. 
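* * (the journal's own error code is returned here rather than a hardcoded -EIO)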
*/ if (error && test_bit(JOURNAL_need_flush_write, &j->flags)) - return -EIO; + return error; if (error || w->noflush || @@ -2060,12 +2099,12 @@ CLOSURE_CALLBACK(bch2_journal_write) struct printbuf buf = PRINTBUF; buf.atomic++; + __bch2_journal_debug_to_text(&buf, j); + spin_unlock(&j->lock); prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), le64_to_cpu(w->data->seq), vstruct_sectors(w->data, c->block_bits), bch2_err_str(ret)); - __bch2_journal_debug_to_text(&buf, j); - spin_unlock(&j->lock); bch2_print_string_as_lines(KERN_ERR, buf.buf); printbuf_exit(&buf); } diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index d373cd181a7f..5d1547aa118a 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -226,7 +226,7 @@ void bch2_journal_space_available(struct journal *j) bch_err(c, "%s", buf.buf); printbuf_exit(&buf); - ret = JOURNAL_ERR_insufficient_devices; + ret = -BCH_ERR_insufficient_journal_devices; goto out; } @@ -240,7 +240,7 @@ void bch2_journal_space_available(struct journal *j) total = j->space[journal_space_total].total; if (!j->space[journal_space_discarded].next_entry) - ret = JOURNAL_ERR_journal_full; + ret = -BCH_ERR_journal_full; if ((j->space[journal_space_clean_ondisk].next_entry < j->space[journal_space_clean_ondisk].total) && @@ -645,7 +645,6 @@ static u64 journal_seq_to_flush(struct journal *j) * @j: journal object * @direct: direct or background reclaim? * @kicked: requested to run since we last ran? - * Returns: 0 on success, or -EIO if the journal has been shutdown * * Background journal reclaim writes out btree nodes. It should be run * early enough so that we never completely run out of journal buckets. @@ -685,10 +684,9 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) if (kthread && kthread_should_stop()) break; - if (bch2_journal_error(j)) { - ret = -EIO; + ret = bch2_journal_error(j); + if (ret) break; - } bch2_journal_do_discards(j); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 1f25c111c54c..e463d2d95359 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -231,15 +231,14 @@ bool bch2_blacklist_entries_gc(struct bch_fs *c) struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; BUG_ON(nr != t->nr); - unsigned i; - for (src = bl->start, i = t->nr == 0 ? 
0 : eytzinger0_first(t->nr); - src < bl->start + nr; - src++, i = eytzinger0_next(i, nr)) { + src = bl->start; + eytzinger0_for_each(i, nr) { BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk) *dst++ = *src; + src++; } unsigned new_nr = dst - bl->start; diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 1ef3a28ed6ab..8e0eba776b9d 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -12,7 +12,11 @@ /* btree write buffer steals 8 bits for its own purposes: */ #define JOURNAL_SEQ_MAX ((1ULL << 56) - 1) -#define JOURNAL_BUF_BITS 2 +#define JOURNAL_STATE_BUF_BITS 2 +#define JOURNAL_STATE_BUF_NR (1U << JOURNAL_STATE_BUF_BITS) +#define JOURNAL_STATE_BUF_MASK (JOURNAL_STATE_BUF_NR - 1) + +#define JOURNAL_BUF_BITS 4 #define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) #define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) @@ -82,7 +86,6 @@ struct journal_entry_pin { struct journal_res { bool ref; - u8 idx; u16 u64s; u32 offset; u64 seq; @@ -98,9 +101,8 @@ union journal_res_state { }; struct { - u64 cur_entry_offset:20, + u64 cur_entry_offset:22, idx:2, - unwritten_idx:2, buf0_count:10, buf1_count:10, buf2_count:10, @@ -110,13 +112,13 @@ union journal_res_state { /* bytes: */ #define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ -#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ +#define JOURNAL_ENTRY_SIZE_MAX (4U << 22) /* 16M */ /* * We stash some journal state as sentinel values in cur_entry_offset: * note - cur_entry_offset is in units of u64s */ -#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) +#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 22) - 1) #define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2) #define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) @@ -149,28 +151,12 @@ enum journal_flags { #undef x }; -/* Reasons we may fail to get a journal reservation: */ -#define JOURNAL_ERRORS() \ - x(ok) \ - x(retry) \ - x(blocked) \ - x(max_in_flight) \ - x(journal_full) \ - x(journal_pin_full) \ - x(journal_stuck) \ - x(insufficient_devices) - -enum journal_errors { -#define x(n) JOURNAL_ERR_##n, - JOURNAL_ERRORS() -#undef x -}; - typedef DARRAY(u64) darray_u64; struct journal_bio { struct bch_dev *ca; unsigned buf_idx; + u64 submit_time; struct bio bio; }; @@ -199,7 +185,7 @@ struct journal { * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if * insufficient devices: */ - enum journal_errors cur_entry_error; + int cur_entry_error; unsigned cur_entry_offset_if_blocked; unsigned buf_size_want; @@ -220,6 +206,8 @@ struct journal { * other is possibly being written out. 
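* * There are JOURNAL_BUF_NR (16) buffers, indexed by the low bits of the sequence number; the lockless reservation state only tracks counts for JOURNAL_STATE_BUF_NR (4) of them in flight - see journal_cur_buf().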
*/ struct journal_buf buf[JOURNAL_BUF_NR]; + void *free_buf; + unsigned free_buf_size; spinlock_t lock; @@ -237,6 +225,7 @@ struct journal { /* Sequence number of most recent journal entry (last entry in @pin) */ atomic64_t seq; + u64 seq_write_started; /* seq, last_seq from the most recent journal entry successfully written */ u64 seq_ondisk; u64 flushed_seq_ondisk; diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index ce794d55818f..2f63fc6d456f 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -6,6 +6,7 @@ #include "btree_iter.h" #include "btree_update.h" #include "btree_write_buffer.h" +#include "ec.h" #include "error.h" #include "lru.h" #include "recovery.h" @@ -59,9 +60,9 @@ int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set); } -int bch2_lru_change(struct btree_trans *trans, - u16 lru_id, u64 dev_bucket, - u64 old_time, u64 new_time) +int __bch2_lru_change(struct btree_trans *trans, + u16 lru_id, u64 dev_bucket, + u64 old_time, u64 new_time) { if (old_time == new_time) return 0; @@ -78,7 +79,9 @@ static const char * const bch2_lru_types[] = { }; int bch2_lru_check_set(struct btree_trans *trans, - u16 lru_id, u64 time, + u16 lru_id, + u64 dev_bucket, + u64 time, struct bkey_s_c referring_k, struct bkey_buf *last_flushed) { @@ -87,9 +90,7 @@ int bch2_lru_check_set(struct btree_trans *trans, struct btree_iter lru_iter; struct bkey_s_c lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, - lru_pos(lru_id, - bucket_to_u64(referring_k.k->p), - time), 0); + lru_pos(lru_id, dev_bucket, time), 0); int ret = bkey_err(lru_k); if (ret) return ret; @@ -100,11 +101,10 @@ int bch2_lru_check_set(struct btree_trans *trans, goto err; if (fsck_err(trans, alloc_key_to_missing_lru_entry, - "missing %s lru entry\n" - " %s", + "missing %s lru entry\n%s", bch2_lru_types[lru_type(lru_k)], (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) { - ret = bch2_lru_set(trans, lru_id, bucket_to_u64(referring_k.k->p), time); + ret = bch2_lru_set(trans, lru_id, dev_bucket, time); if (ret) goto err; } @@ -116,57 +116,81 @@ fsck_err: return ret; } +static struct bbpos lru_pos_to_bp(struct bkey_s_c lru_k) +{ + enum bch_lru_type type = lru_type(lru_k); + + switch (type) { + case BCH_LRU_read: + case BCH_LRU_fragmentation: + return BBPOS(BTREE_ID_alloc, u64_to_bucket(lru_k.k->p.offset)); + case BCH_LRU_stripes: + return BBPOS(BTREE_ID_stripes, POS(0, lru_k.k->p.offset)); + default: + BUG(); + } +} + +static u64 bkey_lru_type_idx(struct bch_fs *c, + enum bch_lru_type type, + struct bkey_s_c k) +{ + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; + + switch (type) { + case BCH_LRU_read: + a = bch2_alloc_to_v4(k, &a_convert); + return alloc_lru_idx_read(*a); + case BCH_LRU_fragmentation: { + a = bch2_alloc_to_v4(k, &a_convert); + + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode); + u64 idx = ca + ? alloc_lru_idx_fragmentation(*a, ca) + : 0; + rcu_read_unlock(); + return idx; + } + case BCH_LRU_stripes: + return k.k->type == KEY_TYPE_stripe + ? 
stripe_lru_pos(bkey_s_c_to_stripe(k).v) + : 0; + default: + BUG(); + } +} + static int bch2_check_lru_key(struct btree_trans *trans, struct btree_iter *lru_iter, struct bkey_s_c lru_k, struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; struct printbuf buf1 = PRINTBUF; struct printbuf buf2 = PRINTBUF; - enum bch_lru_type type = lru_type(lru_k); - struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset); - u64 idx; - int ret; - - struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_pos); - if (fsck_err_on(!ca, - trans, lru_entry_to_invalid_bucket, - "lru key points to nonexistent device:bucket %llu:%llu", - alloc_pos.inode, alloc_pos.offset)) - return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false); + struct bbpos bp = lru_pos_to_bp(lru_k); - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, alloc_pos, 0); - ret = bkey_err(k); + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, bp.btree, bp.pos, 0); + int ret = bkey_err(k); if (ret) goto err; - a = bch2_alloc_to_v4(k, &a_convert); - - switch (type) { - case BCH_LRU_read: - idx = alloc_lru_idx_read(*a); - break; - case BCH_LRU_fragmentation: - idx = alloc_lru_idx_fragmentation(*a, ca); - break; - } + enum bch_lru_type type = lru_type(lru_k); + u64 idx = bkey_lru_type_idx(c, type, k); - if (lru_k.k->type != KEY_TYPE_set || - lru_pos_time(lru_k.k->p) != idx) { + if (lru_pos_time(lru_k.k->p) != idx) { ret = bch2_btree_write_buffer_maybe_flush(trans, lru_k, last_flushed); if (ret) goto err; if (fsck_err(trans, lru_entry_bad, "incorrect lru entry: lru %s time %llu\n" - " %s\n" - " for %s", + "%s\n" + "for %s", bch2_lru_types[type], lru_pos_time(lru_k.k->p), (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), @@ -176,7 +200,6 @@ static int bch2_check_lru_key(struct btree_trans *trans, err: fsck_err: bch2_trans_iter_exit(trans, &iter); - bch2_dev_put(ca); printbuf_exit(&buf2); printbuf_exit(&buf1); return ret; diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index f31a6cf1514c..8abd0aa2083a 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -28,9 +28,14 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l) { u16 lru_id = l.k->p.inode >> 48; - if (lru_id == BCH_LRU_FRAGMENTATION_START) + switch (lru_id) { + case BCH_LRU_BUCKET_FRAGMENTATION: return BCH_LRU_fragmentation; - return BCH_LRU_read; + case BCH_LRU_STRIPE_FRAGMENTATION: + return BCH_LRU_stripes; + default: + return BCH_LRU_read; + } } int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); @@ -46,10 +51,19 @@ void bch2_lru_pos_to_text(struct printbuf *, struct bpos); int bch2_lru_del(struct btree_trans *, u16, u64, u64); int bch2_lru_set(struct btree_trans *, u16, u64, u64); -int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); +int __bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); + +static inline int bch2_lru_change(struct btree_trans *trans, + u16 lru_id, u64 dev_bucket, + u64 old_time, u64 new_time) +{ + return old_time != new_time + ? 
__bch2_lru_change(trans, lru_id, dev_bucket, old_time, new_time) + : 0; +} struct bkey_buf; -int bch2_lru_check_set(struct btree_trans *, u16, u64, struct bkey_s_c, struct bkey_buf *); +int bch2_lru_check_set(struct btree_trans *, u16, u64, u64, struct bkey_s_c, struct bkey_buf *); int bch2_check_lrus(struct bch_fs *); diff --git a/fs/bcachefs/lru_format.h b/fs/bcachefs/lru_format.h index f372cb3b8cda..b7392ad8e41f 100644 --- a/fs/bcachefs/lru_format.h +++ b/fs/bcachefs/lru_format.h @@ -9,7 +9,8 @@ struct bch_lru { #define BCH_LRU_TYPES() \ x(read) \ - x(fragmentation) + x(fragmentation) \ + x(stripes) enum bch_lru_type { #define x(n) BCH_LRU_##n, @@ -17,7 +18,8 @@ enum bch_lru_type { #undef x }; -#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1) +#define BCH_LRU_BUCKET_FRAGMENTATION ((1U << 16) - 1) +#define BCH_LRU_STRIPE_FRAGMENTATION ((1U << 16) - 2) #define LRU_TIME_BITS 48 #define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index ddc187fb693d..57ad662871ba 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -15,6 +15,7 @@ #include "keylist.h" #include "migrate.h" #include "move.h" +#include "progress.h" #include "replicas.h" #include "super-io.h" @@ -76,7 +77,9 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, return 0; } -static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) +static int bch2_dev_usrdata_drop(struct bch_fs *c, + struct progress_indicator_state *progress, + unsigned dev_idx, int flags) { struct btree_trans *trans = bch2_trans_get(c); enum btree_id id; @@ -88,8 +91,10 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags)); + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + bch2_progress_update_iter(trans, progress, &iter, "dropping user data"); + bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags); + })); if (ret) break; } @@ -99,7 +104,9 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) return ret; } -static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) +static int bch2_dev_metadata_drop(struct bch_fs *c, + struct progress_indicator_state *progress, + unsigned dev_idx, int flags) { struct btree_trans *trans; struct btree_iter iter; @@ -125,6 +132,8 @@ retry: while (bch2_trans_begin(trans), (b = bch2_btree_iter_peek_node(&iter)) && !(ret = PTR_ERR_OR_ZERO(b))) { + bch2_progress_update_iter(trans, progress, &iter, "dropping metadata"); + if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx)) goto next; @@ -169,6 +178,11 @@ err: int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) { - return bch2_dev_usrdata_drop(c, dev_idx, flags) ?: - bch2_dev_metadata_drop(c, dev_idx, flags); + struct progress_indicator_state progress; + bch2_progress_init(&progress, c, + BIT_ULL(BTREE_ID_extents)| + BIT_ULL(BTREE_ID_reflink)); + + return bch2_dev_usrdata_drop(c, &progress, dev_idx, flags) ?: + bch2_dev_metadata_drop(c, &progress, dev_idx, flags); } diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 160b4374160a..5d41260e10da 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -38,28 +38,28 @@ const char * const bch2_data_ops_strs[] = { NULL }; -static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k, +static void 
trace_io_move2(struct bch_fs *c, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { - if (trace_move_extent_enabled()) { + if (trace_io_move_enabled()) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); prt_newline(&buf); bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); - trace_move_extent(c, buf.buf); + trace_io_move(c, buf.buf); printbuf_exit(&buf); } } -static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k) +static void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k) { - if (trace_move_extent_read_enabled()) { + if (trace_io_move_read_enabled()) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); - trace_move_extent_read(c, buf.buf); + trace_io_move_read(c, buf.buf); printbuf_exit(&buf); } } @@ -74,11 +74,7 @@ struct moving_io { unsigned read_sectors; unsigned write_sectors; - struct bch_read_bio rbio; - struct data_update write; - /* Must be last since it is variable size */ - struct bio_vec bi_inline_vecs[]; }; static void move_free(struct moving_io *io) @@ -88,43 +84,72 @@ static void move_free(struct moving_io *io) if (io->b) atomic_dec(&io->b->count); - bch2_data_update_exit(&io->write); - mutex_lock(&ctxt->lock); list_del(&io->io_list); wake_up(&ctxt->wait); mutex_unlock(&ctxt->lock); + if (!io->write.data_opts.scrub) { + bch2_data_update_exit(&io->write); + } else { + bch2_bio_free_pages_pool(io->write.op.c, &io->write.op.wbio.bio); + kfree(io->write.bvecs); + } kfree(io); } static void move_write_done(struct bch_write_op *op) { struct moving_io *io = container_of(op, struct moving_io, write.op); + struct bch_fs *c = op->c; struct moving_context *ctxt = io->write.ctxt; - if (io->write.op.error) + if (op->error) { + if (trace_io_move_write_fail_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_write_op_to_text(&buf, op); + prt_printf(&buf, "ret\t%s\n", bch2_err_str(op->error)); + trace_io_move_write_fail(c, buf.buf); + printbuf_exit(&buf); + } + this_cpu_inc(c->counters[BCH_COUNTER_io_move_write_fail]); + ctxt->write_error = true; + } - atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); - atomic_dec(&io->write.ctxt->write_ios); + atomic_sub(io->write_sectors, &ctxt->write_sectors); + atomic_dec(&ctxt->write_ios); move_free(io); closure_put(&ctxt->cl); } static void move_write(struct moving_io *io) { - if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { + struct moving_context *ctxt = io->write.ctxt; + + if (ctxt->stats) { + if (io->write.rbio.bio.bi_status) + atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, + &ctxt->stats->sectors_error_uncorrected); + else if (io->write.rbio.saw_error) + atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, + &ctxt->stats->sectors_error_corrected); + } + + if (unlikely(io->write.rbio.ret || + io->write.rbio.bio.bi_status || + io->write.data_opts.scrub)) { move_free(io); return; } - if (trace_move_extent_write_enabled()) { + if (trace_io_move_write_enabled()) { struct bch_fs *c = io->write.op.c; struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k)); - trace_move_extent_write(c, buf.buf); + trace_io_move_write(c, buf.buf); printbuf_exit(&buf); } @@ -132,7 +157,7 @@ static void move_write(struct moving_io *io) atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); atomic_inc(&io->write.ctxt->write_ios); - bch2_data_update_read_done(&io->write, io->rbio.pick.crc); + bch2_data_update_read_done(&io->write); } struct moving_io *bch2_moving_ctxt_next_pending_write(struct 
moving_context *ctxt) @@ -145,7 +170,7 @@ struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctx static void move_read_endio(struct bio *bio) { - struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); + struct moving_io *io = container_of(bio, struct moving_io, write.rbio.bio); struct moving_context *ctxt = io->write.ctxt; atomic_sub(io->read_sectors, &ctxt->read_sectors); @@ -258,14 +283,10 @@ int bch2_move_extent(struct moving_context *ctxt, { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct moving_io *io; - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned sectors = k.k->size, pages; int ret = -ENOMEM; - trace_move_extent2(c, k, &io_opts, &data_opts); + trace_io_move2(c, k, &io_opts, &data_opts); + this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); if (ctxt->stats) ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos); @@ -273,7 +294,8 @@ int bch2_move_extent(struct moving_context *ctxt, bch2_data_update_opts_normalize(k, &data_opts); if (!data_opts.rewrite_ptrs && - !data_opts.extra_replicas) { + !data_opts.extra_replicas && + !data_opts.scrub) { if (data_opts.kill_ptrs) return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts); return 0; @@ -285,13 +307,7 @@ int bch2_move_extent(struct moving_context *ctxt, */ bch2_trans_unlock(trans); - /* write path might have to decompress data: */ - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); - - pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); - io = kzalloc(sizeof(struct moving_io) + - sizeof(struct bio_vec) * pages, GFP_KERNEL); + struct moving_io *io = kzalloc(sizeof(struct moving_io), GFP_KERNEL); if (!io) goto err; @@ -300,31 +316,27 @@ int bch2_move_extent(struct moving_context *ctxt, io->read_sectors = k.k->size; io->write_sectors = k.k->size; - bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0); - io->write.op.wbio.bio.bi_ioprio = - IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); - - if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, - GFP_KERNEL)) - goto err_free; + if (!data_opts.scrub) { + ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, + &io_opts, data_opts, iter->btree_id, k); + if (ret) + goto err_free; - io->rbio.c = c; - io->rbio.opts = io_opts; - bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0); - io->rbio.bio.bi_vcnt = pages; - io->rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); - io->rbio.bio.bi_iter.bi_size = sectors << 9; + io->write.op.end_io = move_write_done; + } else { + bch2_bkey_buf_init(&io->write.k); + bch2_bkey_buf_reassemble(&io->write.k, c, k); - io->rbio.bio.bi_opf = REQ_OP_READ; - io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); - io->rbio.bio.bi_end_io = move_read_endio; + io->write.op.c = c; + io->write.data_opts = data_opts; - ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, - io_opts, data_opts, iter->btree_id, k); - if (ret) - goto err_free_pages; + ret = bch2_data_update_bios_init(&io->write, c, &io_opts); + if (ret) + goto err_free; + } - io->write.op.end_io = move_write_done; + io->write.rbio.bio.bi_end_io = move_read_endio; + io->write.rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); if (ctxt->rate) bch2_ratelimit_increment(ctxt->rate, k.k->size); @@ -339,9 +351,7 @@ int bch2_move_extent(struct moving_context *ctxt, atomic_inc(&io->b->count); } - 
this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); - this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size); - trace_move_extent_read2(c, k); + trace_io_move_read2(c, k); mutex_lock(&ctxt->lock); atomic_add(io->read_sectors, &ctxt->read_sectors); @@ -356,33 +366,33 @@ int bch2_move_extent(struct moving_context *ctxt, * ctxt when doing wakeup */ closure_get(&ctxt->cl); - bch2_read_extent(trans, &io->rbio, - bkey_start_pos(k.k), - iter->btree_id, k, 0, - BCH_READ_NODECODE| - BCH_READ_LAST_FRAGMENT); + __bch2_read_extent(trans, &io->write.rbio, + io->write.rbio.bio.bi_iter, + bkey_start_pos(k.k), + iter->btree_id, k, 0, + NULL, + BCH_READ_last_fragment, + data_opts.scrub ? data_opts.read_dev : -1); return 0; -err_free_pages: - bio_free_pages(&io->write.op.wbio.bio); err_free: kfree(io); err: - if (ret == -BCH_ERR_data_update_done) + if (bch2_err_matches(ret, BCH_ERR_data_update_done)) return 0; if (bch2_err_matches(ret, EROFS) || bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ret; - count_event(c, move_extent_start_fail); + count_event(c, io_move_start_fail); - if (trace_move_extent_start_fail_enabled()) { + if (trace_io_move_start_fail_enabled()) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); prt_str(&buf, ": "); prt_str(&buf, bch2_err_str(ret)); - trace_move_extent_start_fail(c, buf.buf); + trace_io_move_start_fail(c, buf.buf); printbuf_exit(&buf); } return ret; @@ -518,6 +528,37 @@ int bch2_move_ratelimit(struct moving_context *ctxt) return 0; } +/* + * Move requires non extents iterators, and there's also no need for it to + * signal indirect_extent_missing_error: + */ +static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_reflink_p p) +{ + if (unlikely(REFLINK_P_ERROR(p.v))) + return bkey_s_c_null; + + struct bpos reflink_pos = POS(0, REFLINK_P_IDX(p.v)); + + bch2_trans_iter_init(trans, iter, + BTREE_ID_reflink, reflink_pos, + BTREE_ITER_not_extents); + + struct bkey_s_c k = bch2_btree_iter_peek(iter); + if (!k.k || bkey_err(k)) { + bch2_trans_iter_exit(trans, iter); + return k; + } + + if (bkey_lt(reflink_pos, bkey_start_pos(k.k))) { + bch2_trans_iter_exit(trans, iter); + return bkey_s_c_null; + } + + return k; +} + static int bch2_move_data_btree(struct moving_context *ctxt, struct bpos start, struct bpos end, @@ -551,6 +592,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, bch2_trans_begin(trans); bch2_trans_iter_init(trans, &iter, btree_id, start, BTREE_ITER_prefetch| + BTREE_ITER_not_extents| BTREE_ITER_all_snapshots); if (ctxt->rate) @@ -581,17 +623,16 @@ static int bch2_move_data_btree(struct moving_context *ctxt, k.k->type == KEY_TYPE_reflink_p && REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); bch2_trans_iter_exit(trans, &reflink_iter); - k = bch2_lookup_indirect_extent(trans, &reflink_iter, &offset_into_extent, p, true, 0); + k = bch2_lookup_indirect_extent_for_move(trans, &reflink_iter, p); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) break; - if (bkey_deleted(k.k)) + if (!k.k) goto next_nondata; /* @@ -627,7 +668,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) continue; - if (ret2 == -ENOMEM) { + if (bch2_err_matches(ret2, ENOMEM)) { /* memory allocation failure, wait for 
some IO to finish */ bch2_move_ctxt_wait_for_io(ctxt); continue; @@ -689,21 +730,22 @@ int bch2_move_data(struct bch_fs *c, bool wait_on_copygc, move_pred_fn pred, void *arg) { - struct moving_context ctxt; - int ret; bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); - ret = __bch2_move_data(&ctxt, start, end, pred, arg); + int ret = __bch2_move_data(&ctxt, start, end, pred, arg); bch2_moving_ctxt_exit(&ctxt); return ret; } -int bch2_evacuate_bucket(struct moving_context *ctxt, - struct move_bucket_in_flight *bucket_in_flight, - struct bpos bucket, int gen, - struct data_update_opts _data_opts) +static int __bch2_move_data_phys(struct moving_context *ctxt, + struct move_bucket_in_flight *bucket_in_flight, + unsigned dev, + u64 bucket_start, + u64 bucket_end, + unsigned data_types, + move_pred_fn pred, void *arg) { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; @@ -712,16 +754,19 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, struct btree_iter iter = {}, bp_iter = {}; struct bkey_buf sk; struct bkey_s_c k; - struct data_update_opts data_opts; - unsigned sectors_moved = 0; struct bkey_buf last_flushed; int ret = 0; - struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); + struct bch_dev *ca = bch2_dev_tryget(c, dev); if (!ca) return 0; - trace_bucket_evacuate(c, &bucket); + bucket_end = min(bucket_end, ca->mi.nbuckets); + + struct bpos bp_start = bucket_pos_to_bp_start(ca, POS(dev, bucket_start)); + struct bpos bp_end = bucket_pos_to_bp_end(ca, POS(dev, bucket_end)); + bch2_dev_put(ca); + ca = NULL; bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); @@ -732,8 +777,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, */ bch2_trans_begin(trans); - bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp_start(ca, bucket), 0); + bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_start, 0); bch_err_msg(c, ret, "looking up alloc key"); if (ret) @@ -757,7 +801,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, if (ret) goto err; - if (!k.k || bkey_gt(k.k->p, bucket_pos_to_bp_end(ca, bucket))) + if (!k.k || bkey_gt(k.k->p, bp_end)) break; if (k.k->type != KEY_TYPE_backpointer) @@ -765,107 +809,148 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - if (!bp.v->level) { - k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - goto err; - if (!k.k) - goto next; + if (ctxt->stats) + ctxt->stats->offset = bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; + + if (!(data_types & BIT(bp.v->data_type))) + goto next; - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); + if (!bp.v->level && bp.v->btree_id == BTREE_ID_stripes) + goto next; + k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; + if (!k.k) + goto next; + + if (!bp.v->level) { ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k); if (ret) { bch2_trans_iter_exit(trans, &iter); continue; } + } - data_opts = _data_opts; - data_opts.target = io_opts.background_target; - data_opts.rewrite_ptrs = 0; - - unsigned sectors = bp.v->bucket_len; /* move_extent will drop locks */ - unsigned i = 0; - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, 
entry) { - if (p.ptr.dev == bucket.inode) { - if (p.ptr.cached) { - bch2_trans_iter_exit(trans, &iter); - goto next; - } - data_opts.rewrite_ptrs |= 1U << i; - break; - } - i++; - } - - ret = bch2_move_extent(ctxt, bucket_in_flight, - &iter, k, io_opts, data_opts); + struct data_update_opts data_opts = {}; + if (!pred(c, arg, k, &io_opts, &data_opts)) { bch2_trans_iter_exit(trans, &iter); + goto next; + } - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret == -ENOMEM) { - /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(ctxt); - continue; - } - if (ret) - goto err; - - if (ctxt->stats) - atomic64_add(sectors, &ctxt->stats->sectors_seen); - sectors_moved += sectors; - } else { - struct btree *b; + if (data_opts.scrub && + !bch2_dev_idx_is_online(c, data_opts.read_dev)) { + bch2_trans_iter_exit(trans, &iter); + ret = -BCH_ERR_device_offline; + break; + } - b = bch2_backpointer_get_node(trans, bp, &iter, &last_flushed); - ret = PTR_ERR_OR_ZERO(b); - if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) - goto next; - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - goto err; - if (!b) - goto next; + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); - unsigned sectors = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); + /* move_extent will drop locks */ + unsigned sectors = bp.v->bucket_len; - ret = bch2_btree_node_rewrite(trans, &iter, b, 0); - bch2_trans_iter_exit(trans, &iter); + if (!bp.v->level) + ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts); + else if (!data_opts.scrub) + ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, k.k->p, 0); + else + ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - goto err; + bch2_trans_iter_exit(trans, &iter); - if (ctxt->rate) - bch2_ratelimit_increment(ctxt->rate, sectors); - if (ctxt->stats) { - atomic64_add(sectors, &ctxt->stats->sectors_seen); - atomic64_add(sectors, &ctxt->stats->sectors_moved); - } - sectors_moved += btree_sectors(c); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret == -ENOMEM) { + /* memory allocation failure, wait for some IO to finish */ + bch2_move_ctxt_wait_for_io(ctxt); + continue; } + if (ret) + goto err; + + if (ctxt->stats) + atomic64_add(sectors, &ctxt->stats->sectors_seen); next: bch2_btree_iter_advance(&bp_iter); } - - trace_evacuate_bucket(c, &bucket, sectors_moved, ca->mi.bucket_size, ret); err: bch2_trans_iter_exit(trans, &bp_iter); - bch2_dev_put(ca); bch2_bkey_buf_exit(&sk, c); bch2_bkey_buf_exit(&last_flushed, c); return ret; } +static int bch2_move_data_phys(struct bch_fs *c, + unsigned dev, + u64 start, + u64 end, + unsigned data_types, + struct bch_ratelimit *rate, + struct bch_move_stats *stats, + struct write_point_specifier wp, + bool wait_on_copygc, + move_pred_fn pred, void *arg) +{ + struct moving_context ctxt; + + bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans)); + + bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); + ctxt.stats->phys = true; + ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys; + + int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, data_types, pred, arg); + bch2_moving_ctxt_exit(&ctxt); + + return ret; +} + +struct evacuate_bucket_arg { + struct bpos bucket; + int gen; + struct data_update_opts data_opts; +}; + +static bool 
evacuate_bucket_pred(struct bch_fs *c, void *_arg, struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + struct evacuate_bucket_arg *arg = _arg; + + *data_opts = arg->data_opts; + + unsigned i = 0; + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { + if (ptr->dev == arg->bucket.inode && + (arg->gen < 0 || arg->gen == ptr->gen) && + !ptr->cached) + data_opts->rewrite_ptrs |= BIT(i); + i++; + } + + return data_opts->rewrite_ptrs != 0; +} + +int bch2_evacuate_bucket(struct moving_context *ctxt, + struct move_bucket_in_flight *bucket_in_flight, + struct bpos bucket, int gen, + struct data_update_opts data_opts) +{ + struct evacuate_bucket_arg arg = { bucket, gen, data_opts, }; + + return __bch2_move_data_phys(ctxt, bucket_in_flight, + bucket.inode, + bucket.offset, + bucket.offset + 1, + ~0, + evacuate_bucket_pred, &arg); +} + typedef bool (*move_btree_pred)(struct bch_fs *, void *, struct btree *, struct bch_io_opts *, struct data_update_opts *); @@ -1007,14 +1092,6 @@ static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); } -static bool migrate_btree_pred(struct bch_fs *c, void *arg, - struct btree *b, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); -} - /* * Ancient versions of bcachefs produced packed formats which could represent * keys that the in memory format cannot represent; this checks for those @@ -1104,6 +1181,30 @@ static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg, return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); } +static bool scrub_pred(struct bch_fs *c, void *_arg, + struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + struct bch_ioctl_data *arg = _arg; + + if (k.k->type != KEY_TYPE_btree_ptr_v2) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (p.ptr.dev == arg->migrate.dev) { + if (!p.crc.csum_type) + return false; + break; + } + } + + data_opts->scrub = true; + data_opts->read_dev = arg->migrate.dev; + return true; +} + int bch2_data_job(struct bch_fs *c, struct bch_move_stats *stats, struct bch_ioctl_data op) @@ -1118,6 +1219,22 @@ int bch2_data_job(struct bch_fs *c, bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]); switch (op.op) { + case BCH_DATA_OP_scrub: + /* + * prevent tests from spuriously failing, make sure we see all + * btree nodes that need to be repaired + */ + bch2_btree_interior_updates_flush(c); + + ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX, + op.scrub.data_types, + NULL, + stats, + writepoint_hashed((unsigned long) current), + false, + scrub_pred, &op) ?: ret; + break; + case BCH_DATA_OP_rereplicate: stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, -1); @@ -1137,14 +1254,14 @@ int bch2_data_job(struct bch_fs *c, stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); - ret = bch2_move_btree(c, start, end, - migrate_btree_pred, &op, stats) ?: ret; - ret = bch2_move_data(c, start, end, - NULL, - stats, - writepoint_hashed((unsigned long) current), - true, - migrate_pred, &op) ?: ret; + ret = bch2_move_data_phys(c, op.migrate.dev, 0, U64_MAX, + ~0, + NULL, + stats, + writepoint_hashed((unsigned long) current), + true, + 
migrate_pred, &op) ?: ret; + bch2_btree_interior_updates_flush(c); ret = bch2_replicas_gc2(c) ?: ret; break; case BCH_DATA_OP_rewrite_old_nodes: @@ -1176,17 +1293,17 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) prt_newline(out); printbuf_indent_add(out, 2); - prt_printf(out, "keys moved: %llu\n", atomic64_read(&stats->keys_moved)); - prt_printf(out, "keys raced: %llu\n", atomic64_read(&stats->keys_raced)); - prt_printf(out, "bytes seen: "); + prt_printf(out, "keys moved:\t%llu\n", atomic64_read(&stats->keys_moved)); + prt_printf(out, "keys raced:\t%llu\n", atomic64_read(&stats->keys_raced)); + prt_printf(out, "bytes seen:\t"); prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9); prt_newline(out); - prt_printf(out, "bytes moved: "); + prt_printf(out, "bytes moved:\t"); prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9); prt_newline(out); - prt_printf(out, "bytes raced: "); + prt_printf(out, "bytes raced:\t"); prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9); prt_newline(out); @@ -1195,7 +1312,8 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt) { - struct moving_io *io; + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); bch2_move_stats_to_text(out, ctxt->stats); printbuf_indent_add(out, 2); @@ -1215,8 +1333,9 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str printbuf_indent_add(out, 2); mutex_lock(&ctxt->lock); + struct moving_io *io; list_for_each_entry(io, &ctxt->ios, io_list) - bch2_write_op_to_text(out, &io->write.op); + bch2_data_update_inflight_to_text(out, &io->write); mutex_unlock(&ctxt->lock); printbuf_indent_sub(out, 4); diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h index e22841ef31e4..807f779f6f76 100644 --- a/fs/bcachefs/move_types.h +++ b/fs/bcachefs/move_types.h @@ -3,22 +3,36 @@ #define _BCACHEFS_MOVE_TYPES_H #include "bbpos_types.h" +#include "bcachefs_ioctl.h" struct bch_move_stats { - enum bch_data_type data_type; - struct bbpos pos; char name[32]; + bool phys; + enum bch_ioctl_data_event_ret ret; + + union { + struct { + enum bch_data_type data_type; + struct bbpos pos; + }; + struct { + unsigned dev; + u64 offset; + }; + }; atomic64_t keys_moved; atomic64_t keys_raced; atomic64_t sectors_seen; atomic64_t sectors_moved; atomic64_t sectors_raced; + atomic64_t sectors_error_corrected; + atomic64_t sectors_error_uncorrected; }; struct move_bucket_key { struct bpos bucket; - u8 gen; + unsigned gen; }; struct move_bucket { diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 6718dc37c5a3..5126c870ce5b 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -167,8 +167,8 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, bch2_trans_begin(trans); ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru, - lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), - lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), + lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0), + lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX), 0, k, ({ struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) }; int ret2 = 0; @@ -317,6 +317,17 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) prt_printf(out, "Currently calculated wait:\t"); prt_human_readable_u64(out, bch2_copygc_wait_amount(c)); prt_newline(out); + + rcu_read_lock(); + 
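/* * Take a reference on the copygc thread while still under RCU, then print the backtrace outside the read-side section: bch2_prt_task_backtrace() allocates with GFP_KERNEL and may sleep. */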
struct task_struct *t = rcu_dereference(c->copygc_thread); + if (t) + get_task_struct(t); + rcu_read_unlock(); + + if (t) { + bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); + put_task_struct(t); + } } static int bch2_copygc_thread(void *arg) diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/namei.c index 2c3d46ac70c6..ee7251709fb9 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/namei.c @@ -4,8 +4,8 @@ #include "acl.h" #include "btree_update.h" #include "dirent.h" -#include "fs-common.h" #include "inode.h" +#include "namei.h" #include "subvolume.h" #include "xattr.h" @@ -47,6 +47,10 @@ int bch2_create_trans(struct btree_trans *trans, if (ret) goto err; + /* Inherit casefold state from parent. */ + if (S_ISDIR(mode)) + new_inode->bi_flags |= dir_u->bi_flags & BCH_INODE_casefolded; + if (!(flags & BCH_CREATE_SNAPSHOT)) { /* Normal create path - allocate a new inode: */ bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); @@ -153,16 +157,14 @@ int bch2_create_trans(struct btree_trans *trans, dir_u->bi_nlink++; dir_u->bi_mtime = dir_u->bi_ctime = now; - ret = bch2_inode_write(trans, &dir_iter, dir_u); - if (ret) - goto err; - - ret = bch2_dirent_create(trans, dir, &dir_hash, - dir_type, - name, - dir_target, - &dir_offset, - STR_HASH_must_create|BTREE_ITER_with_updates); + ret = bch2_dirent_create(trans, dir, &dir_hash, + dir_type, + name, + dir_target, + &dir_offset, + &dir_u->bi_size, + STR_HASH_must_create|BTREE_ITER_with_updates) ?: + bch2_inode_write(trans, &dir_iter, dir_u); if (ret) goto err; @@ -225,7 +227,9 @@ int bch2_link_trans(struct btree_trans *trans, ret = bch2_dirent_create(trans, dir, &dir_hash, mode_to_type(inode_u->bi_mode), - name, inum.inum, &dir_offset, + name, inum.inum, + &dir_offset, + &dir_u->bi_size, STR_HASH_must_create); if (ret) goto err; @@ -417,8 +421,8 @@ int bch2_rename_trans(struct btree_trans *trans, } ret = bch2_dirent_rename(trans, - src_dir, &src_hash, - dst_dir, &dst_hash, + src_dir, &src_hash, &src_dir_u->bi_size, + dst_dir, &dst_hash, &dst_dir_u->bi_size, src_name, &src_inum, &src_offset, dst_name, &dst_inum, &dst_offset, mode); @@ -560,6 +564,8 @@ err: return ret; } +/* inum_to_path */ + static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n) { bch2_printbuf_make_room(out, n); @@ -650,3 +656,179 @@ disconnected: prt_str_reversed(path, "(disconnected)"); goto out; } + +/* fsck */ + +static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + bool in_fsck) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + struct btree_iter bp_iter = { NULL }; + int ret = 0; + + if (inode_points_to_dirent(target, d)) + return 0; + + if (!target->bi_dir && + !target->bi_dir_offset) { + fsck_err_on(S_ISDIR(target->bi_mode), + trans, inode_dir_missing_backpointer, + "directory with missing backpointer\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), + prt_printf(&buf, "\n"), + bch2_inode_unpacked_to_text(&buf, target), + buf.buf)); + + fsck_err_on(target->bi_flags & BCH_INODE_unlinked, + trans, inode_unlinked_but_has_dirent, + "inode unlinked but has dirent\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), + prt_printf(&buf, "\n"), + bch2_inode_unpacked_to_text(&buf, target), + buf.buf)); + + target->bi_flags &= ~BCH_INODE_unlinked; + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + return __bch2_fsck_write_inode(trans, target); + } + + if 
(bch2_inode_should_have_single_bp(target) && + !fsck_err(trans, inode_wrong_backpointer, + "dirent points to inode that does not point back:\n%s", + (bch2_bkey_val_to_text(&buf, c, d.s_c), + prt_newline(&buf), + bch2_inode_unpacked_to_text(&buf, target), + buf.buf))) + goto err; + + struct bkey_s_c_dirent bp_dirent = + bch2_bkey_get_iter_typed(trans, &bp_iter, BTREE_ID_dirents, + SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot), + 0, dirent); + ret = bkey_err(bp_dirent); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + bool backpointer_exists = !ret; + ret = 0; + + if (!backpointer_exists) { + if (fsck_err(trans, inode_wrong_backpointer, + "inode %llu:%u has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", + target->bi_inum, target->bi_snapshot, + target->bi_dir, + target->bi_dir_offset, + d.k->p.inode, + d.k->p.offset)) { + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + ret = __bch2_fsck_write_inode(trans, target); + } + } else { + bch2_bkey_val_to_text(&buf, c, d.s_c); + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); + + if (S_ISDIR(target->bi_mode) || target->bi_subvol) { + /* + * XXX: verify connectivity of the other dirent + * up to the root before removing this one + * + * Additionally, bch2_lookup would need to cope with the + * dirent it found being removed - or should we remove + * the other one, even though the inode points to it? + */ + if (in_fsck) { + if (fsck_err(trans, inode_dir_multiple_links, + "%s %llu:%u with multiple links\n%s", + S_ISDIR(target->bi_mode) ? "directory" : "subvolume", + target->bi_inum, target->bi_snapshot, buf.buf)) + ret = bch2_fsck_remove_dirent(trans, d.k->p); + } else { + bch2_fs_inconsistent(c, + "%s %llu:%u with multiple links\n%s", + S_ISDIR(target->bi_mode) ? 
"directory" : "subvolume", + target->bi_inum, target->bi_snapshot, buf.buf); + } + + goto out; + } else { + /* + * hardlinked file with nlink 0: + * We're just adjusting nlink here so check_nlinks() will pick + * it up, it ignores inodes with nlink 0 + */ + if (fsck_err_on(!target->bi_nlink, + trans, inode_multiple_links_but_nlink_0, + "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", + target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { + target->bi_nlink++; + target->bi_flags &= ~BCH_INODE_unlinked; + ret = __bch2_fsck_write_inode(trans, target); + if (ret) + goto err; + } + } + } +out: +err: +fsck_err: + bch2_trans_iter_exit(trans, &bp_iter); + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} + +int __bch2_check_dirent_target(struct btree_trans *trans, + struct btree_iter *dirent_iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + bool in_fsck) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + int ret = 0; + + ret = bch2_check_dirent_inode_dirent(trans, d, target, in_fsck); + if (ret) + goto err; + + if (fsck_err_on(d.v->d_type != inode_d_type(target), + trans, dirent_d_type_wrong, + "incorrect d_type: got %s, should be %s:\n%s", + bch2_d_type_str(d.v->d_type), + bch2_d_type_str(inode_d_type(target)), + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { + struct bkey_i_dirent *n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_type = inode_d_type(target); + if (n->v.d_type == DT_SUBVOL) { + n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); + n->v.d_child_subvol = cpu_to_le32(target->bi_subvol); + } else { + n->v.d_inum = cpu_to_le64(target->bi_inum); + } + + ret = bch2_trans_update(trans, dirent_iter, &n->k_i, 0); + if (ret) + goto err; + } +err: +fsck_err: + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/namei.h index 2b59210bb5e8..2e6f6364767f 100644 --- a/fs/bcachefs/fs-common.h +++ b/fs/bcachefs/namei.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FS_COMMON_H -#define _BCACHEFS_FS_COMMON_H +#ifndef _BCACHEFS_NAMEI_H +#define _BCACHEFS_NAMEI_H #include "dirent.h" @@ -44,4 +44,29 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *, int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *); -#endif /* _BCACHEFS_FS_COMMON_H */ +int __bch2_check_dirent_target(struct btree_trans *, + struct btree_iter *, + struct bkey_s_c_dirent, + struct bch_inode_unpacked *, bool); + +static inline bool inode_points_to_dirent(struct bch_inode_unpacked *inode, + struct bkey_s_c_dirent d) +{ + return inode->bi_dir == d.k->p.inode && + inode->bi_dir_offset == d.k->p.offset; +} + +static inline int bch2_check_dirent_target(struct btree_trans *trans, + struct btree_iter *dirent_iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + bool in_fsck) +{ + if (likely(inode_points_to_dirent(target, d) && + d.v->d_type == inode_d_type(target))) + return 0; + + return __bch2_check_dirent_target(trans, dirent_iter, d, target, in_fsck); +} + +#endif /* _BCACHEFS_NAMEI_H */ diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 6772faf385a5..af3258814822 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -44,7 +44,7 @@ const char * const __bch2_btree_ids[] = { NULL }; -static const char * const __bch2_csum_types[] = { +const char * const __bch2_csum_types[] = 
{ BCH_CSUM_TYPES() NULL }; @@ -163,16 +163,6 @@ const char * const bch2_d_types[BCH_DT_MAX] = { [DT_SUBVOL] = "subvol", }; -u64 BCH2_NO_SB_OPT(const struct bch_sb *sb) -{ - BUG(); -} - -void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v) -{ - BUG(); -} - void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) { #define x(_name, ...) \ @@ -223,6 +213,21 @@ void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) } } +/* dummy option, for options that aren't stored in the superblock */ +typedef u64 (*sb_opt_get_fn)(const struct bch_sb *); +typedef void (*sb_opt_set_fn)(struct bch_sb *, u64); +typedef u64 (*member_opt_get_fn)(const struct bch_member *); +typedef void (*member_opt_set_fn)(struct bch_member *, u64); + +__maybe_unused static const sb_opt_get_fn BCH2_NO_SB_OPT = NULL; +__maybe_unused static const sb_opt_set_fn SET_BCH2_NO_SB_OPT = NULL; +__maybe_unused static const member_opt_get_fn BCH2_NO_MEMBER_OPT = NULL; +__maybe_unused static const member_opt_set_fn SET_BCH2_NO_MEMBER_OPT = NULL; + +#define type_compatible_or_null(_p, _type) \ + __builtin_choose_expr( \ + __builtin_types_compatible_p(typeof(_p), typeof(_type)), _p, NULL) + const struct bch_option bch2_opt_table[] = { #define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2 #define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \ @@ -239,15 +244,15 @@ const struct bch_option bch2_opt_table[] = { #define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ [Opt_##_name] = { \ - .attr = { \ - .name = #_name, \ - .mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \ - }, \ - .flags = _flags, \ - .hint = _hint, \ - .help = _help, \ - .get_sb = _sb_opt, \ - .set_sb = SET_##_sb_opt, \ + .attr.name = #_name, \ + .attr.mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \ + .flags = _flags, \ + .hint = _hint, \ + .help = _help, \ + .get_sb = type_compatible_or_null(_sb_opt, *BCH2_NO_SB_OPT), \ + .set_sb = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_SB_OPT), \ + .get_member = type_compatible_or_null(_sb_opt, *BCH2_NO_MEMBER_OPT), \ + .set_member = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_MEMBER_OPT),\ _type \ }, @@ -475,11 +480,16 @@ void bch2_opts_to_text(struct printbuf *out, } } -int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) +int bch2_opt_check_may_set(struct bch_fs *c, struct bch_dev *ca, int id, u64 v) { int ret = 0; switch (id) { + case Opt_state: + if (ca) + return bch2_dev_set_state(c, ca, v, BCH_FORCE_IF_DEGRADED); + break; + case Opt_compression: case Opt_background_compression: ret = bch2_check_set_has_compressed_data(c, v); @@ -495,12 +505,8 @@ int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) int bch2_opts_check_may_set(struct bch_fs *c) { - unsigned i; - int ret; - - for (i = 0; i < bch2_opts_nr; i++) { - ret = bch2_opt_check_may_set(c, i, - bch2_opt_get_by_id(&c->opts, i)); + for (unsigned i = 0; i < bch2_opts_nr; i++) { + int ret = bch2_opt_check_may_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i)); if (ret) return ret; } @@ -543,14 +549,15 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, goto bad_opt; ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); - if (ret == -BCH_ERR_option_needs_open_fs && parse_later) { - prt_printf(parse_later, "%s=%s,", name, val); - if (parse_later->allocation_failure) { - ret = -ENOMEM; - goto out; + if (ret == -BCH_ERR_option_needs_open_fs) { + ret = 0; + + if (parse_later) { + prt_printf(parse_later, "%s=%s,", name, val); + if (parse_later->allocation_failure) + ret = -ENOMEM; } - ret = 0; 
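The type_compatible_or_null() macro above is compile-time dispatch: the same x-macro argument (BCH_SB_BLOCK_SIZE, BCH_MEMBER_DISCARD, ...) is offered to both the superblock and the member accessor slots of the option table, and __builtin_types_compatible_p() decides which slot actually receives the function, the other getting NULL. A self-contained sketch of the mechanism, with toy types (GCC/clang builtins assumed):

#include <stdio.h>

struct sb { unsigned long long block_size; };
struct member { unsigned long long bucket_size; };

typedef unsigned long long (*sb_get_fn)(const struct sb *);
typedef unsigned long long (*member_get_fn)(const struct member *);

static const sb_get_fn NO_SB_OPT = NULL;
static const member_get_fn NO_MEMBER_OPT = NULL;

static unsigned long long SB_BLOCK_SIZE(const struct sb *s)
{
	return s->block_size;
}

#define compat_or_null(_p, _type)					\
	__builtin_choose_expr(						\
		__builtin_types_compatible_p(__typeof__(_p),		\
					     __typeof__(_type)), _p, NULL)

/* one table entry: both slots fed from the same macro argument */
static const sb_get_fn get_sb = compat_or_null(SB_BLOCK_SIZE, *NO_SB_OPT);
static const member_get_fn get_member = compat_or_null(SB_BLOCK_SIZE, *NO_MEMBER_OPT);

int main(void)
{
	struct sb s = { .block_size = 4096 };

	/* prints: get_sb 4096, get_member (nil) */
	printf("get_sb %llu, get_member %p\n",
	       get_sb(&s), (void *) get_member);
	return 0;
}

This is what lets the patch delete the old BUG()-bodied dummy functions and the separate BCH_DEV_OPT_SETTERS() table: a NULL callback now simply means "not stored here", which the opts.c hunks test with plain if (opt->get_sb) / if (opt->set_member).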
goto out; } @@ -561,28 +568,24 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, bch2_opt_set_by_id(opts, id, v); ret = 0; - goto out; - +out: + printbuf_exit(&err); + return ret; bad_opt: - pr_err("Bad mount option %s", name); ret = -BCH_ERR_option_name; goto out; - bad_val: - pr_err("Invalid mount option %s", err.buf); ret = -BCH_ERR_option_value; - -out: - printbuf_exit(&err); - return ret; + goto out; } int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, - struct printbuf *parse_later, char *options) + struct printbuf *parse_later, char *options, + bool ignore_unknown) { char *copied_opts, *copied_opts_start; char *opt, *name, *val; - int ret; + int ret = 0; if (!options) return 0; @@ -607,24 +610,37 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, val = opt; ret = bch2_parse_one_mount_opt(c, opts, parse_later, name, val); - if (ret < 0) - goto out; + if (ret == -BCH_ERR_option_name && ignore_unknown) + ret = 0; + if (ret) { + pr_err("Error parsing option %s: %s", name, bch2_err_str(ret)); + break; + } } - ret = 0; - goto out; - -out: kfree(copied_opts_start); return ret; } -u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id) +u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id, int dev_idx) { const struct bch_option *opt = bch2_opt_table + id; u64 v; - v = opt->get_sb(sb); + if (dev_idx < 0) { + v = opt->get_sb(sb); + } else { + if (WARN(!bch2_member_exists(sb, dev_idx), + "tried to set device option %s on nonexistent device %i", + opt->attr.name, dev_idx)) + return 0; + + struct bch_member m = bch2_sb_member_get(sb, dev_idx); + v = opt->get_member(&m); + } + + if (opt->flags & OPT_SB_FIELD_ONE_BIAS) + --v; if (opt->flags & OPT_SB_FIELD_ILOG2) v = 1ULL << v; @@ -641,35 +657,19 @@ u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id) */ int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) { - unsigned id; - - for (id = 0; id < bch2_opts_nr; id++) { + for (unsigned id = 0; id < bch2_opts_nr; id++) { const struct bch_option *opt = bch2_opt_table + id; - if (opt->get_sb == BCH2_NO_SB_OPT) - continue; - - bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id)); + if (opt->get_sb) + bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id, -1)); } return 0; } -struct bch_dev_sb_opt_set { - void (*set_sb)(struct bch_member *, u64); -}; - -static const struct bch_dev_sb_opt_set bch2_dev_sb_opt_setters [] = { -#define x(n, set) [Opt_##n] = { .set_sb = SET_##set }, - BCH_DEV_OPT_SETTERS() -#undef x -}; - void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, const struct bch_option *opt, u64 v) { - enum bch_opt_id id = opt - bch2_opt_table; - if (opt->flags & OPT_SB_FIELD_SECTORS) v >>= 9; @@ -679,24 +679,16 @@ void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, if (opt->flags & OPT_SB_FIELD_ONE_BIAS) v++; - if (opt->flags & OPT_FS) { - if (opt->set_sb != SET_BCH2_NO_SB_OPT) - opt->set_sb(sb, v); - } + if ((opt->flags & OPT_FS) && opt->set_sb && dev_idx < 0) + opt->set_sb(sb, v); - if ((opt->flags & OPT_DEVICE) && dev_idx >= 0) { + if ((opt->flags & OPT_DEVICE) && opt->set_member && dev_idx >= 0) { if (WARN(!bch2_member_exists(sb, dev_idx), "tried to set device option %s on nonexistent device %i", opt->attr.name, dev_idx)) return; - struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx); - - const struct bch_dev_sb_opt_set *set = bch2_dev_sb_opt_setters + id; - if (set->set_sb) - set->set_sb(m, v); - else - pr_err("option %s cannot be set via opt_set_sb()", opt->attr.name); + 
opt->set_member(bch2_members_v2_get_mut(sb, dev_idx), v); } } diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 9d397fc2a1f0..4d06313076ff 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -16,6 +16,7 @@ extern const char * const bch2_version_upgrade_opts[]; extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; extern const char * const __bch2_btree_ids[]; +extern const char * const __bch2_csum_types[]; extern const char * const __bch2_csum_opts[]; extern const char * const __bch2_compression_types[]; extern const char * const bch2_compression_opts[]; @@ -50,10 +51,6 @@ static inline const char *bch2_d_type_str(unsigned d_type) * apply the options from that struct that are defined. */ -/* dummy option, for options that aren't stored in the superblock */ -u64 BCH2_NO_SB_OPT(const struct bch_sb *); -void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64); - /* When can be set: */ enum opt_flags { OPT_FS = BIT(0), /* Filesystem option */ @@ -132,19 +129,24 @@ enum fsck_err_opts { OPT_FS|OPT_FORMAT| \ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ OPT_UINT(512, 1U << 16), \ - BCH_SB_BLOCK_SIZE, 8, \ + BCH_SB_BLOCK_SIZE, 4 << 10, \ "size", NULL) \ x(btree_node_size, u32, \ OPT_FS|OPT_FORMAT| \ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ OPT_UINT(512, 1U << 20), \ - BCH_SB_BTREE_NODE_SIZE, 512, \ + BCH_SB_BTREE_NODE_SIZE, 256 << 10, \ "size", "Btree node size, default 256k") \ x(errors, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_error_actions), \ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \ NULL, "Action to take on filesystem error") \ + x(write_error_timeout, u16, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1, 300), \ + BCH_SB_WRITE_ERROR_TIMEOUT, 30, \ + NULL, "Number of consecutive write errors allowed before kicking out a device")\ x(metadata_replicas, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(1, BCH_REPLICAS_MAX), \ @@ -181,6 +183,11 @@ enum fsck_err_opts { OPT_STR(__bch2_csum_opts), \ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ NULL, NULL) \ + x(checksum_err_retry_nr, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(0, 32), \ + BCH_SB_CSUM_ERR_RETRY_NR, 3, \ + NULL, NULL) \ x(compression, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_compression), \ @@ -197,7 +204,7 @@ enum fsck_err_opts { BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ NULL, "Hash function for directory entries and xattrs")\ x(metadata_target, u16, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_METADATA_TARGET, 0, \ "(target)", "Device or label for metadata writes") \ @@ -308,11 +315,6 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Don't kick drives out when splitbrain detected")\ - x(discard, u8, \ - OPT_FS|OPT_MOUNT|OPT_DEVICE, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "Enable discard/TRIM support") \ x(verbose, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ @@ -493,27 +495,32 @@ enum fsck_err_opts { BCH2_NO_SB_OPT, false, \ NULL, "Skip submit_bio() for data reads and writes, " \ "for performance testing purposes") \ - x(fs_size, u64, \ - OPT_DEVICE, \ + x(state, u64, \ + OPT_DEVICE|OPT_RUNTIME, \ + OPT_STR(bch2_member_states), \ + BCH_MEMBER_STATE, BCH_MEMBER_STATE_rw, \ + "state", "rw,ro,failed,spare") \ + x(bucket_size, u32, \ + OPT_DEVICE|OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \ OPT_UINT(0, S64_MAX), \ - BCH2_NO_SB_OPT, 
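bch2_opt_from_sb() and __bch2_opt_set_sb() above have to stay exact mirror images: on the way into the superblock, OPT_SB_FIELD_SECTORS values are scaled down to 512-byte sectors, OPT_SB_FIELD_ILOG2 values are stored as their log2, and OPT_SB_FIELD_ONE_BIAS values are stored off by one so that 0 can mean "unset"; on the way out the steps run in reverse order. A standalone sketch of the assumed round trip (the sectors step on the decode side is implied by the encode side shown in these hunks):

#include <assert.h>
#include <stdint.h>

#define OPT_SB_FIELD_SECTORS	(1 << 0)
#define OPT_SB_FIELD_ILOG2	(1 << 1)
#define OPT_SB_FIELD_ONE_BIAS	(1 << 2)

static uint64_t opt_encode(unsigned flags, uint64_t v)
{
	if (flags & OPT_SB_FIELD_SECTORS)
		v >>= 9;			/* bytes -> 512 byte sectors */
	if (flags & OPT_SB_FIELD_ILOG2)
		v = __builtin_ctzll(v);		/* v must be a power of two */
	if (flags & OPT_SB_FIELD_ONE_BIAS)
		v++;				/* 0 is reserved for "unset" */
	return v;
}

static uint64_t opt_decode(unsigned flags, uint64_t v)
{
	if (flags & OPT_SB_FIELD_ONE_BIAS)
		--v;
	if (flags & OPT_SB_FIELD_ILOG2)
		v = 1ULL << v;
	if (flags & OPT_SB_FIELD_SECTORS)
		v <<= 9;
	return v;
}

int main(void)
{
	unsigned flags = OPT_SB_FIELD_SECTORS | OPT_SB_FIELD_ILOG2;

	assert(opt_decode(flags, opt_encode(flags, 1 << 16)) == 1 << 16);
	return 0;
}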
0, \ - "size", "Size of filesystem on device") \ - x(bucket, u32, \ - OPT_DEVICE, \ - OPT_UINT(0, S64_MAX), \ - BCH2_NO_SB_OPT, 0, \ + BCH_MEMBER_BUCKET_SIZE, 0, \ "size", "Specifies the bucket size; must be greater than the btree node size")\ x(durability, u8, \ - OPT_DEVICE|OPT_SB_FIELD_ONE_BIAS, \ + OPT_DEVICE|OPT_RUNTIME|OPT_SB_FIELD_ONE_BIAS, \ OPT_UINT(0, BCH_REPLICAS_MAX), \ - BCH2_NO_SB_OPT, 1, \ + BCH_MEMBER_DURABILITY, 1, \ "n", "Data written to this device will be considered\n"\ "to have already been replicated n times") \ x(data_allowed, u8, \ OPT_DEVICE, \ OPT_BITFIELD(__bch2_data_types), \ - BCH2_NO_SB_OPT, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\ + BCH_MEMBER_DATA_ALLOWED, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\ "types", "Allowed data types for this device: journal, btree, and/or user")\ + x(discard, u8, \ + OPT_MOUNT|OPT_DEVICE|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_MEMBER_DISCARD, true, \ + NULL, "Enable discard/TRIM support") \ x(btree_node_prefetch, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ @@ -521,11 +528,6 @@ enum fsck_err_opts { NULL, "BTREE_ITER_prefetch causes btree nodes to be\n"\ " prefetched sequentially") -#define BCH_DEV_OPT_SETTERS() \ - x(discard, BCH_MEMBER_DISCARD) \ - x(durability, BCH_MEMBER_DURABILITY) \ - x(data_allowed, BCH_MEMBER_DATA_ALLOWED) - struct bch_opts { #define x(_name, _bits, ...) unsigned _name##_defined:1; BCH_OPTS() @@ -582,8 +584,6 @@ struct printbuf; struct bch_option { struct attribute attr; - u64 (*get_sb)(const struct bch_sb *); - void (*set_sb)(struct bch_sb *, u64); enum opt_type type; enum opt_flags flags; u64 min, max; @@ -595,6 +595,12 @@ struct bch_option { const char *hint; const char *help; + u64 (*get_sb)(const struct bch_sb *); + void (*set_sb)(struct bch_sb *, u64); + + u64 (*get_member)(const struct bch_member *); + void (*set_member)(struct bch_member *, u64); + }; extern const struct bch_option bch2_opt_table[]; @@ -603,7 +609,7 @@ bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); -u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id); +u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id, int); int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); void __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64); @@ -625,12 +631,12 @@ void bch2_opts_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, unsigned, unsigned, unsigned); -int bch2_opt_check_may_set(struct bch_fs *, int, u64); +int bch2_opt_check_may_set(struct bch_fs *, struct bch_dev *, int, u64); int bch2_opts_check_may_set(struct bch_fs *); int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *, struct printbuf *, const char *, const char *); int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *, - char *); + char *, bool); /* inode opts: */ diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c index 4cf5a2af1e6f..3302bbc78a09 100644 --- a/fs/bcachefs/printbuf.c +++ b/fs/bcachefs/printbuf.c @@ -277,6 +277,25 @@ void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces) } /** + * bch2_printbuf_indent_add_nextline() - add to the current indent level for + * subsequent lines + * + * @buf: printbuf to control + * @spaces: number of spaces to add to the current indent level + * + * Subsequent lines - not the current line - will be indented by @spaces more + * spaces.
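A hypothetical use of the helper documented above, to show the difference from bch2_printbuf_indent_add(): the line currently being built keeps its position, and only output after the next newline picks up the extra indent:

	prt_printf(out, "superblock section:");	/* current line: old indent */
	bch2_printbuf_indent_add_nextline(out, 2);
	prt_newline(out);
	prt_printf(out, "field a");		/* indented by 2 more spaces */
	prt_newline(out);
	prt_printf(out, "field b");		/* indented by 2 more spaces */
	bch2_printbuf_indent_sub(out, 2);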
+ */ +void bch2_printbuf_indent_add_nextline(struct printbuf *buf, unsigned spaces) +{ + if (WARN_ON_ONCE(buf->indent + spaces < buf->indent)) + spaces = 0; + + buf->indent += spaces; + buf->has_indent_or_tabstops = true; +} + +/** * bch2_printbuf_indent_sub() - subtract from the current indent level * * @buf: printbuf to control diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h index d0dd398baa2b..1ca476adbf6f 100644 --- a/fs/bcachefs/printbuf.h +++ b/fs/bcachefs/printbuf.h @@ -112,6 +112,7 @@ void bch2_printbuf_tabstop_pop(struct printbuf *); int bch2_printbuf_tabstop_push(struct printbuf *, unsigned); void bch2_printbuf_indent_add(struct printbuf *, unsigned); +void bch2_printbuf_indent_add_nextline(struct printbuf *, unsigned); void bch2_printbuf_indent_sub(struct printbuf *, unsigned); void bch2_prt_newline(struct printbuf *); diff --git a/fs/bcachefs/progress.c b/fs/bcachefs/progress.c new file mode 100644 index 000000000000..d09898566abe --- /dev/null +++ b/fs/bcachefs/progress.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bbpos.h" +#include "disk_accounting.h" +#include "progress.h" + +void bch2_progress_init(struct progress_indicator_state *s, + struct bch_fs *c, + u64 btree_id_mask) +{ + memset(s, 0, sizeof(*s)); + + s->next_print = jiffies + HZ * 10; + + for (unsigned i = 0; i < BTREE_ID_NR; i++) { + if (!(btree_id_mask & BIT_ULL(i))) + continue; + + struct disk_accounting_pos acc; + disk_accounting_key_init(acc, btree, .id = i); + + u64 v; + bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); + s->nodes_total += div64_ul(v, btree_sectors(c)); + } +} + +static inline bool progress_update_p(struct progress_indicator_state *s) +{ + bool ret = time_after_eq(jiffies, s->next_print); + + if (ret) + s->next_print = jiffies + HZ * 10; + return ret; +} + +void bch2_progress_update_iter(struct btree_trans *trans, + struct progress_indicator_state *s, + struct btree_iter *iter, + const char *msg) +{ + struct bch_fs *c = trans->c; + struct btree *b = path_l(btree_iter_path(trans, iter))->b; + + s->nodes_seen += b != s->last_node; + s->last_node = b; + + if (progress_update_p(s)) { + struct printbuf buf = PRINTBUF; + unsigned percent = s->nodes_total + ? div64_u64(s->nodes_seen * 100, s->nodes_total) + : 0; + + prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ", + msg, percent, s->nodes_seen, s->nodes_total); + bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos)); + + bch_info(c, "%s", buf.buf); + printbuf_exit(&buf); + } +} diff --git a/fs/bcachefs/progress.h b/fs/bcachefs/progress.h new file mode 100644 index 000000000000..23fb1811f943 --- /dev/null +++ b/fs/bcachefs/progress.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_PROGRESS_H +#define _BCACHEFS_PROGRESS_H + +/* + * Lame progress indicators + * + * We don't like to use these because they print to the dmesg console, which is + * spammy - we much prefer to be wired up to a userspace program (e.g. via + * thread_with_file) and have it print the progress indicator. + * + * But some code is old and doesn't support that, or runs in a context where + * that's not yet practical (mount).
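The two calls above are all a consumer needs. A hypothetical walk wiring them up (assuming the usual for_each_btree_key() statement-expression form used elsewhere in bcachefs; this exact call site is not part of the patch):

static int walk_extents_with_progress(struct bch_fs *c)
{
	struct progress_indicator_state progress;
	bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_extents));

	struct btree_trans *trans = bch2_trans_get(c);
	int ret = for_each_btree_key(trans, iter,
				     BTREE_ID_extents, POS_MIN,
				     BTREE_ITER_prefetch, k, ({
		bch2_progress_update_iter(trans, &progress, &iter, "walk_extents");
		/* ... per-key work here ... */
		0;
	}));
	bch2_trans_put(trans);
	return ret;
}

Since nodes_total comes from the btree-size accounting counters, the percentage is an estimate; a btree that grows during the walk will drift, which is acceptable for a rate-limited dmesg progress line.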
+ */ + +struct progress_indicator_state { + unsigned long next_print; + u64 nodes_seen; + u64 nodes_total; + struct btree *last_node; +}; + +void bch2_progress_init(struct progress_indicator_state *, struct bch_fs *, u64); +void bch2_progress_update_iter(struct btree_trans *, + struct progress_indicator_state *, + struct btree_iter *, + const char *); + +#endif /* _BCACHEFS_PROGRESS_H */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index d0a1f5cd5c2b..b9bde04b66c0 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -26,9 +26,8 @@ /* bch_extent_rebalance: */ -static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; bkey_extent_entry_for_each(ptrs, entry) @@ -38,6 +37,11 @@ static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s return NULL; } +static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +{ + return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k)); +} + static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c, struct bch_io_opts *opts, struct bkey_s_c k, @@ -97,11 +101,12 @@ static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) { - const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs); if (!opts) return 0; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; u64 sectors = 0; @@ -341,7 +346,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, memset(data_opts, 0, sizeof(*data_opts)); data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); data_opts->target = io_opts->background_target; - data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; + data_opts->write_flags |= BCH_WRITE_only_specified_devs; if (!data_opts->rewrite_ptrs) { /* @@ -449,7 +454,7 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, { data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); data_opts->target = io_opts->background_target; - data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; + data_opts->write_flags |= BCH_WRITE_only_specified_devs; return data_opts->rewrite_ptrs != 0; } @@ -590,8 +595,20 @@ static int bch2_rebalance_thread(void *arg) void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) { + printbuf_tabstop_push(out, 32); + struct bch_fs_rebalance *r = &c->rebalance; + /* print pending work */ + struct disk_accounting_pos acc; + disk_accounting_key_init(acc, rebalance_work); + u64 v; + bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); + + prt_printf(out, "pending work:\t"); + prt_human_readable_u64(out, v << 9); + prt_printf(out, "\n\n"); + prt_str(out, bch2_rebalance_state_strs[r->state]); prt_newline(out); printbuf_indent_add(out, 2); @@ -600,15 +617,15 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) case BCH_REBALANCE_waiting: { u64 now = atomic64_read(&c->io_clock[WRITE].now); - prt_str(out, "io wait duration: "); + prt_printf(out, "io wait duration:\t"); bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9); 
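The "pending work" block added above uses the same disk-accounting read idiom as bch2_progress_init(): build a typed accounting position, map it to a btree position, read the in-memory counter array. Pulled out as a helper-shaped sketch, matching the hunk:

static u64 pending_rebalance_bytes(struct bch_fs *c)
{
	struct disk_accounting_pos acc;
	disk_accounting_key_init(acc, rebalance_work);

	u64 sectors;
	bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc),
				 &sectors, 1);

	return sectors << 9;	/* counter is in 512-byte sectors */
}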
prt_newline(out); - prt_str(out, "io wait remaining: "); + prt_printf(out, "io wait remaining:\t"); bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9); prt_newline(out); - prt_str(out, "duration waited: "); + prt_printf(out, "duration waited:\t"); bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start); prt_newline(out); break; @@ -621,6 +638,18 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) break; } prt_newline(out); + + rcu_read_lock(); + struct task_struct *t = rcu_dereference(c->rebalance.thread); + if (t) + get_task_struct(t); + rcu_read_unlock(); + + if (t) { + bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); + put_task_struct(t); + } + printbuf_indent_sub(out, 2); } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 71c786cdb192..266c5770c824 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -13,12 +13,12 @@ #include "disk_accounting.h" #include "errcode.h" #include "error.h" -#include "fs-common.h" #include "journal_io.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" #include "logged_ops.h" #include "move.h" +#include "namei.h" #include "quota.h" #include "rebalance.h" #include "recovery.h" @@ -899,7 +899,7 @@ use_clean: * journal sequence numbers: */ if (!c->sb.clean) - journal_seq += 8; + journal_seq += JOURNAL_BUF_NR * 4; if (blacklist_seq != journal_seq) { ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu", diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 0b3c951c32da..593ff142530d 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -234,28 +234,22 @@ static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) int bch2_run_online_recovery_passes(struct bch_fs *c) { - int ret = 0; - - down_read(&c->state_lock); - for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) { struct recovery_pass_fn *p = recovery_pass_fns + i; if (!(p->when & PASS_ONLINE)) continue; - ret = bch2_run_recovery_pass(c, i); + int ret = bch2_run_recovery_pass(c, i); if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) { i = c->curr_recovery_pass; continue; } if (ret) - break; + return ret; } - up_read(&c->state_lock); - - return ret; + return 0; } int bch2_run_recovery_passes(struct bch_fs *c) diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index 418557960ed6..e89b9c783285 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -24,7 +24,7 @@ x(check_topology, 4, 0) \ x(accounting_read, 39, PASS_ALWAYS) \ x(alloc_read, 0, PASS_ALWAYS) \ - x(stripes_read, 1, PASS_ALWAYS) \ + x(stripes_read, 1, 0) \ x(initialize_subvolumes, 2, 0) \ x(snapshots_read, 3, PASS_ALWAYS) \ x(check_allocations, 5, PASS_FSCK) \ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 441e648f28b5..ee23f1f93acc 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -185,12 +185,21 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans, BUG_ON(missing_start < refd_start); BUG_ON(missing_end > refd_end); - if (fsck_err(trans, reflink_p_to_missing_reflink_v, - "pointer to missing indirect extent\n" - " %s\n" - " missing range %llu-%llu", - (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), - missing_start, missing_end)) { + struct bpos missing_pos = bkey_start_pos(p.k); + missing_pos.offset += missing_start - live_start; + + prt_printf(&buf, "pointer to missing indirect extent in "); + ret = 
bch2_inum_snap_offset_err_msg_trans(trans, &buf, missing_pos); + if (ret) + goto err; + + prt_printf(&buf, "-%llu\n", (missing_pos.offset + (missing_end - missing_start)) << 9); + bch2_bkey_val_to_text(&buf, c, p.s_c); + + prt_printf(&buf, "\nmissing reflink btree range %llu-%llu", + missing_start, missing_end); + + if (fsck_err(trans, reflink_p_to_missing_reflink_v, "%s", buf.buf)) { struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p); ret = PTR_ERR_OR_ZERO(new); if (ret) @@ -314,10 +323,10 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, __le64 *refcount = bkey_refcount(bkey_i_to_s(new)); if (!*refcount && (flags & BTREE_TRIGGER_overwrite)) { bch2_bkey_val_to_text(&buf, c, p.s_c); - prt_printf(&buf, "\n "); + prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, k); log_fsck_err(trans, reflink_refcount_underflow, - "indirect extent refcount underflow while marking\n %s", + "indirect extent refcount underflow while marking\n%s", buf.buf); goto next; } @@ -597,7 +606,7 @@ s64 bch2_remap_range(struct bch_fs *c, u64 dst_done = 0; u32 dst_snapshot, src_snapshot; bool reflink_p_may_update_opts_field = - bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts); + !bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts); int ret = 0, ret2 = 0; if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink)) @@ -786,8 +795,8 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans, if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), trans, reflink_v_refcount_wrong, "reflink key has wrong refcount:\n" - " %s\n" - " should be %u", + "%s\n" + "should be %u", (bch2_bkey_val_to_text(&buf, c, k), buf.buf), r->refcount)) { struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); diff --git a/fs/bcachefs/sb-counters.c b/fs/bcachefs/sb-counters.c index 6992e7469112..2b4b8445d418 100644 --- a/fs/bcachefs/sb-counters.c +++ b/fs/bcachefs/sb-counters.c @@ -5,7 +5,13 @@ /* BCH_SB_FIELD_counters */ -static const char * const bch2_counter_names[] = { +static const u8 counters_to_stable_map[] = { +#define x(n, id, ...) [BCH_COUNTER_##n] = BCH_COUNTER_STABLE_##n, + BCH_PERSISTENT_COUNTERS() +#undef x +}; + +const char * const bch2_counter_names[] = { #define x(t, n, ...) (#t), BCH_PERSISTENT_COUNTERS() #undef x @@ -18,13 +24,13 @@ static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs) return 0; return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0]; -}; +} static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f, enum bch_validate_flags flags, struct printbuf *err) { return 0; -}; +} static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) @@ -32,50 +38,56 @@ static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field_counters *ctrs = field_to_type(f, counters); unsigned int nr = bch2_sb_counter_nr_entries(ctrs); - for (unsigned i = 0; i < nr; i++) - prt_printf(out, "%s \t%llu\n", - i < BCH_COUNTER_NR ? 
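counters_to_stable_map above is the key to the counter renumbering later in this series: the first enum stays dense and in list order (indexing the percpu array), the second enum pins every counter to its stable on-disk slot via the explicit "= n", and the table translates between the two, so entries can be renamed or reordered in the list without breaking counters already written to existing superblocks. A toy version of the three pieces:

#define COUNTERS() \
	x(io_read,  0) \
	x(io_write, 1) \
	x(io_move,  2)

enum counters {				/* dense, list order */
#define x(n, id) COUNTER_##n,
	COUNTERS()
#undef x
	COUNTER_NR
};

enum counters_stable {			/* pinned on-disk slots */
#define x(n, id) COUNTER_STABLE_##n = id,
	COUNTERS()
#undef x
};

static const unsigned char to_stable[] = {
#define x(n, id) [COUNTER_##n] = COUNTER_STABLE_##n,
	COUNTERS()
#undef x
};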
bch2_counter_names[i] : "(unknown)", - le64_to_cpu(ctrs->d[i])); -}; + for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { + unsigned stable = counters_to_stable_map[i]; + if (stable < nr) + prt_printf(out, "%s \t%llu\n", + bch2_counter_names[i], + le64_to_cpu(ctrs->d[stable])); + } +} int bch2_sb_counters_to_cpu(struct bch_fs *c) { struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); - unsigned int i; unsigned int nr = bch2_sb_counter_nr_entries(ctrs); - u64 val = 0; - for (i = 0; i < BCH_COUNTER_NR; i++) + for (unsigned i = 0; i < BCH_COUNTER_NR; i++) c->counters_on_mount[i] = 0; - for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) { - val = le64_to_cpu(ctrs->d[i]); - percpu_u64_set(&c->counters[i], val); - c->counters_on_mount[i] = val; + for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { + unsigned stable = counters_to_stable_map[i]; + if (stable < nr) { + u64 v = le64_to_cpu(ctrs->d[stable]); + percpu_u64_set(&c->counters[i], v); + c->counters_on_mount[i] = v; + } } + return 0; -}; +} int bch2_sb_counters_from_cpu(struct bch_fs *c) { struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); struct bch_sb_field_counters *ret; - unsigned int i; unsigned int nr = bch2_sb_counter_nr_entries(ctrs); if (nr < BCH_COUNTER_NR) { ret = bch2_sb_field_resize(&c->disk_sb, counters, - sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); - + sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); if (ret) { ctrs = ret; nr = bch2_sb_counter_nr_entries(ctrs); } } + for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { + unsigned stable = counters_to_stable_map[i]; + if (stable < nr) + ctrs->d[stable] = cpu_to_le64(percpu_u64_get(&c->counters[i])); + } - for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) - ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i])); return 0; } @@ -97,3 +109,39 @@ const struct bch_sb_field_ops bch_sb_field_ops_counters = { .validate = bch2_sb_counters_validate, .to_text = bch2_sb_counters_to_text, }; + +#ifndef NO_BCACHEFS_CHARDEV +long bch2_ioctl_query_counters(struct bch_fs *c, + struct bch_ioctl_query_counters __user *user_arg) +{ + struct bch_ioctl_query_counters arg; + int ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)); + if (ret) + return ret; + + if ((arg.flags & ~BCH_IOCTL_QUERY_COUNTERS_MOUNT) || + arg.pad) + return -EINVAL; + + arg.nr = min(arg.nr, BCH_COUNTER_NR); + ret = put_user(arg.nr, &user_arg->nr); + if (ret) + return ret; + + for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { + unsigned stable = counters_to_stable_map[i]; + + if (stable < arg.nr) { + u64 v = !(arg.flags & BCH_IOCTL_QUERY_COUNTERS_MOUNT) + ? 
percpu_u64_get(&c->counters[i]) + : c->counters_on_mount[i]; + + ret = put_user(v, &user_arg->d[stable]); + if (ret) + return ret; + } + } + + return 0; +} +#endif diff --git a/fs/bcachefs/sb-counters.h b/fs/bcachefs/sb-counters.h index 81f8aec9fcb1..a4329ad8dd1b 100644 --- a/fs/bcachefs/sb-counters.h +++ b/fs/bcachefs/sb-counters.h @@ -11,6 +11,10 @@ int bch2_sb_counters_from_cpu(struct bch_fs *); void bch2_fs_counters_exit(struct bch_fs *); int bch2_fs_counters_init(struct bch_fs *); +extern const char * const bch2_counter_names[]; extern const struct bch_sb_field_ops bch_sb_field_ops_counters; +long bch2_ioctl_query_counters(struct bch_fs *, + struct bch_ioctl_query_counters __user *); + #endif // _BCACHEFS_SB_COUNTERS_H diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h index fdcf598f08b1..fa27ec59a647 100644 --- a/fs/bcachefs/sb-counters_format.h +++ b/fs/bcachefs/sb-counters_format.h @@ -9,10 +9,24 @@ enum counters_flags { #define BCH_PERSISTENT_COUNTERS() \ x(io_read, 0, TYPE_SECTORS) \ + x(io_read_inline, 80, TYPE_SECTORS) \ + x(io_read_hole, 81, TYPE_SECTORS) \ + x(io_read_promote, 30, TYPE_COUNTER) \ + x(io_read_bounce, 31, TYPE_COUNTER) \ + x(io_read_split, 33, TYPE_COUNTER) \ + x(io_read_reuse_race, 34, TYPE_COUNTER) \ + x(io_read_retry, 32, TYPE_COUNTER) \ x(io_write, 1, TYPE_SECTORS) \ x(io_move, 2, TYPE_SECTORS) \ + x(io_move_read, 35, TYPE_SECTORS) \ + x(io_move_write, 36, TYPE_SECTORS) \ + x(io_move_finish, 37, TYPE_SECTORS) \ + x(io_move_fail, 38, TYPE_COUNTER) \ + x(io_move_write_fail, 82, TYPE_COUNTER) \ + x(io_move_start_fail, 39, TYPE_COUNTER) \ x(bucket_invalidate, 3, TYPE_COUNTER) \ x(bucket_discard, 4, TYPE_COUNTER) \ + x(bucket_discard_fast, 79, TYPE_COUNTER) \ x(bucket_alloc, 5, TYPE_COUNTER) \ x(bucket_alloc_fail, 6, TYPE_COUNTER) \ x(btree_cache_scan, 7, TYPE_COUNTER) \ @@ -38,16 +52,6 @@ enum counters_flags { x(journal_reclaim_finish, 27, TYPE_COUNTER) \ x(journal_reclaim_start, 28, TYPE_COUNTER) \ x(journal_write, 29, TYPE_COUNTER) \ - x(read_promote, 30, TYPE_COUNTER) \ - x(read_bounce, 31, TYPE_COUNTER) \ - x(read_split, 33, TYPE_COUNTER) \ - x(read_retry, 32, TYPE_COUNTER) \ - x(read_reuse_race, 34, TYPE_COUNTER) \ - x(move_extent_read, 35, TYPE_SECTORS) \ - x(move_extent_write, 36, TYPE_SECTORS) \ - x(move_extent_finish, 37, TYPE_SECTORS) \ - x(move_extent_fail, 38, TYPE_COUNTER) \ - x(move_extent_start_fail, 39, TYPE_COUNTER) \ x(copygc, 40, TYPE_COUNTER) \ x(copygc_wait, 41, TYPE_COUNTER) \ x(gc_gens_end, 42, TYPE_COUNTER) \ @@ -95,6 +99,13 @@ enum bch_persistent_counters { BCH_COUNTER_NR }; +enum bch_persistent_counters_stable { +#define x(t, n, ...) 
BCH_COUNTER_STABLE_##t = n, + BCH_PERSISTENT_COUNTERS() +#undef x + BCH_COUNTER_STABLE_NR +}; + struct bch_sb_field_counters { struct bch_sb_field field; __le64 d[]; diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 051214fdc735..acb5d845841e 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -90,7 +90,13 @@ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ BCH_FSCK_ERR_accounting_mismatch, \ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ - BCH_FSCK_ERR_accounting_key_junk_at_end) + BCH_FSCK_ERR_accounting_key_junk_at_end) \ + x(cached_backpointers, \ + BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ + BCH_FSCK_ERR_ptr_to_missing_backpointer) \ + x(stripe_backpointers, \ + BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ + BCH_FSCK_ERR_ptr_to_missing_backpointer) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index b86ec013d7d7..5d43e3504386 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -5,8 +5,7 @@ enum bch_fsck_flags { FSCK_CAN_FIX = 1 << 0, FSCK_CAN_IGNORE = 1 << 1, - FSCK_NO_RATELIMIT = 1 << 2, - FSCK_AUTOFIX = 1 << 3, + FSCK_AUTOFIX = 1 << 2, }; #define BCH_SB_ERRS() \ @@ -179,6 +178,7 @@ enum bch_fsck_flags { x(ptr_crc_redundant, 160, 0) \ x(ptr_crc_nonce_mismatch, 162, 0) \ x(ptr_stripe_redundant, 163, 0) \ + x(extent_flags_not_at_start, 306, 0) \ x(reservation_key_nr_replicas_invalid, 164, 0) \ x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \ x(reflink_v_pos_bad, 292, 0) \ @@ -310,11 +310,14 @@ enum bch_fsck_flags { x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \ x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ x(accounting_key_version_0, 282, FSCK_AUTOFIX) \ + x(accounting_key_nr_counters_wrong, 307, FSCK_AUTOFIX) \ x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ x(directory_size_mismatch, 303, FSCK_AUTOFIX) \ - x(MAX, 304, 0) + x(dirent_cf_name_too_big, 304, 0) \ + x(dirent_stray_data_after_cf_name, 305, 0) \ + x(MAX, 308, 0) enum bch_sb_error_id { #define x(t, n, ...) 
BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 762083b564ee..38261638a611 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -23,7 +23,19 @@ static inline bool bch2_dev_is_online(struct bch_dev *ca) return !percpu_ref_is_zero(&ca->io_ref); } -static inline bool bch2_dev_is_readable(struct bch_dev *ca) +static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned); + +static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev) +{ + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, dev); + bool ret = ca && bch2_dev_is_online(ca); + rcu_read_unlock(); + + return ret; +} + +static inline bool bch2_dev_is_healthy(struct bch_dev *ca) { return bch2_dev_is_online(ca) && ca->mi.state != BCH_MEMBER_STATE_failed; @@ -271,6 +283,8 @@ static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw) { + might_sleep(); + rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu(c, dev); if (ca && !percpu_ref_tryget(&ca->io_ref)) diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h index 2adf1221a440..3affec823b3f 100644 --- a/fs/bcachefs/sb-members_format.h +++ b/fs/bcachefs/sb-members_format.h @@ -79,6 +79,7 @@ struct bch_member { #define BCH_MEMBER_V1_BYTES 56 +LE16_BITMASK(BCH_MEMBER_BUCKET_SIZE, struct bch_member, bucket_size, 0, 16) LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4) /* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index c54091a28909..0c65065b08ec 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -146,8 +146,9 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) goto out; } - while (id && id < ancestor - IS_ANCESTOR_BITMAP) - id = get_ancestor_below(t, id, ancestor); + if (likely(ancestor >= IS_ANCESTOR_BITMAP)) + while (id && id < ancestor - IS_ANCESTOR_BITMAP) + id = get_ancestor_below(t, id, ancestor); ret = id && id < ancestor ? 
test_ancestor_bitmap(t, id, ancestor) @@ -389,7 +390,7 @@ static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) return 0; } -static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) +u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) { u32 id = snapshot_root; u32 subvol = 0, s; @@ -484,7 +485,7 @@ static int check_snapshot_tree(struct btree_trans *trans, root_id != bch2_snapshot_root(c, root_id) || st.k->p.offset != le32_to_cpu(s.tree), trans, snapshot_tree_to_missing_snapshot, - "snapshot tree points to missing/incorrect snapshot:\n %s", + "snapshot tree points to missing/incorrect snapshot:\n%s", (bch2_bkey_val_to_text(&buf, c, st.s_c), prt_newline(&buf), ret @@ -504,19 +505,19 @@ static int check_snapshot_tree(struct btree_trans *trans, if (fsck_err_on(ret, trans, snapshot_tree_to_missing_subvol, - "snapshot tree points to missing subvolume:\n %s", + "snapshot tree points to missing subvolume:\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || fsck_err_on(!bch2_snapshot_is_ancestor(c, le32_to_cpu(subvol.snapshot), root_id), trans, snapshot_tree_to_wrong_subvol, - "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s", + "snapshot tree points to subvolume that does not point to snapshot in this tree:\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), trans, snapshot_tree_to_snapshot_subvol, - "snapshot tree points to snapshot subvolume:\n %s", + "snapshot tree points to snapshot subvolume:\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { struct bkey_i_snapshot_tree *u; @@ -755,7 +756,7 @@ static int check_snapshot(struct btree_trans *trans, } else { if (fsck_err_on(s.subvol, trans, snapshot_should_not_have_subvol, - "snapshot should not point to subvol:\n %s", + "snapshot should not point to subvol:\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ret = PTR_ERR_OR_ZERO(u); @@ -773,7 +774,7 @@ static int check_snapshot(struct btree_trans *trans, if (fsck_err_on(!ret, trans, snapshot_to_bad_snapshot_tree, - "snapshot points to missing/incorrect tree:\n %s", + "snapshot points to missing/incorrect tree:\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = snapshot_tree_ptr_repair(trans, iter, k, &s); if (ret) @@ -785,7 +786,7 @@ static int check_snapshot(struct btree_trans *trans, if (fsck_err_on(le32_to_cpu(s.depth) != real_depth, trans, snapshot_bad_depth, - "snapshot with incorrect depth field, should be %u:\n %s", + "snapshot with incorrect depth field, should be %u:\n%s", real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ret = PTR_ERR_OR_ZERO(u); @@ -802,7 +803,7 @@ static int check_snapshot(struct btree_trans *trans, if (fsck_err_on(!ret, trans, snapshot_bad_skiplist, - "snapshot with bad skiplist field:\n %s", + "snapshot with bad skiplist field:\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ret = PTR_ERR_OR_ZERO(u); diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 00373cf32e7b..81180181d7c9 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -105,6 +105,7 @@ static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) return id; } +u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *, u32); u32 
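The guard added to __bch2_snapshot_is_ancestor() above fixes an unsigned-underflow hazard: with u32 ids, "ancestor - IS_ANCESTOR_BITMAP" wraps to a huge value when ancestor is small, so the skiplist walk could run right past the ancestor. Restated as a standalone sketch (IS_ANCESTOR_BITMAP's value and the equality fallback on the last line are assumptions; the two helpers are the existing ones):

#define IS_ANCESTOR_BITMAP	128	/* assumed value */

struct snapshot_table;
u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor);
bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor);

static bool snapshot_is_ancestor(struct snapshot_table *t, u32 id, u32 ancestor)
{
	/* only walk while the subtraction cannot wrap */
	if (likely(ancestor >= IS_ANCESTOR_BITMAP))
		while (id && id < ancestor - IS_ANCESTOR_BITMAP)
			id = get_ancestor_below(t, id, ancestor);

	return id && id < ancestor
		? test_ancestor_bitmap(t, id, ancestor)
		: id == ancestor;
}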
bch2_snapshot_skiplist_get(struct bch_fs *, u32); static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c index d78451c2a0c6..602afca2f5ef 100644 --- a/fs/bcachefs/str_hash.c +++ b/fs/bcachefs/str_hash.c @@ -50,7 +50,7 @@ static noinline int fsck_rename_dirent(struct btree_trans *trans, for (unsigned i = 0; i < 1000; i++) { unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u", old_name.len, old_name.name, i); - unsigned u64s = BKEY_U64s + dirent_val_u64s(len); + unsigned u64s = BKEY_U64s + dirent_val_u64s(len, 0); if (u64s > U8_MAX) return -EINVAL; @@ -232,7 +232,7 @@ bad_hash: goto out; if (fsck_err(trans, hash_table_key_wrong_offset, - "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s", + "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", bch2_btree_id_str(desc->btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 55a4ac7bf220..575ad1e03904 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -12,7 +12,6 @@ #include "super.h" #include <linux/crc32c.h> -#include <crypto/hash.h> #include <crypto/sha2.h> static inline enum bch_str_hash_type @@ -34,6 +33,7 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) struct bch_hash_info { u8 type; + struct unicode_map *cf_encoding; /* * For crc32 or crc64 string hashes the first key value of * the siphash_key (k0) is used as the key. @@ -47,17 +47,17 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) /* XXX ick */ struct bch_hash_info info = { .type = INODE_STR_HASH(bi), +#ifdef CONFIG_UNICODE + .cf_encoding = !!(bi->bi_flags & BCH_INODE_casefolded) ? 
c->cf_encoding : NULL, +#endif .siphash_key = { .k0 = bi->bi_hash_seed } }; if (unlikely(info.type == BCH_STR_HASH_siphash_old)) { - SHASH_DESC_ON_STACK(desc, c->sha256); u8 digest[SHA256_DIGEST_SIZE]; - desc->tfm = c->sha256; - - crypto_shash_digest(desc, (void *) &bi->bi_hash_seed, - sizeof(bi->bi_hash_seed), digest); + sha256((const u8 *)&bi->bi_hash_seed, + sizeof(bi->bi_hash_seed), digest); memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); } diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index b7b96283c316..cd0d8e5e44e7 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -561,6 +561,7 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) } SET_BCH_SUBVOLUME_UNLINKED(&n->v, true); + n->v.fs_path_parent = 0; bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index a81a7b6c0989..572b06bfa0b8 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -25,9 +25,6 @@ #include <linux/sort.h> #include <linux/string_choices.h> -static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { -}; - struct bch2_metadata_version { u16 version; const char *name; @@ -69,12 +66,14 @@ enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_meta return v; } -bool bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version) +int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version) { - bool ret = (c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) && - version <= c->sb.version_incompat_allowed; + int ret = ((c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) && + version <= c->sb.version_incompat_allowed) + ? 0 + : -BCH_ERR_may_not_use_incompat_feature; - if (ret) { + if (!ret) { mutex_lock(&c->sb_lock); SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb, max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version)); @@ -366,39 +365,41 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out) return 0; } -static int bch2_sb_validate(struct bch_sb_handle *disk_sb, - enum bch_validate_flags flags, struct printbuf *out) +int bch2_sb_validate(struct bch_sb *sb, u64 read_offset, + enum bch_validate_flags flags, struct printbuf *out) { - struct bch_sb *sb = disk_sb->sb; struct bch_sb_field_members_v1 *mi; enum bch_opt_id opt_id; - u16 block_size; int ret; ret = bch2_sb_compatible(sb, out); if (ret) return ret; - if (sb->features[1] || - (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) { - prt_printf(out, "Filesystem has incompatible features"); + u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR); + unsigned incompat_bit = 0; + if (incompat) + incompat_bit = __ffs64(incompat); + else if (sb->features[1]) + incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1])); + + if (incompat_bit) { + prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)", + incompat_bit, + bch2_sb_features[BCH_FEATURE_NR - 1], + BCH_FEATURE_NR - 1); return -BCH_ERR_invalid_sb_features; } if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) || BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) { - prt_printf(out, "Filesystem has incompatible version"); + prt_str(out, "Filesystem has incompatible version "); + bch2_version_to_text(out, le16_to_cpu(sb->version)); + prt_str(out, ", current version "); + bch2_version_to_text(out, bcachefs_metadata_version_current); return 
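The str_hash.h hunk above is the payoff of the Kconfig switch from CRYPTO_SHA256 to CRYPTO_LIB_SHA256 at the top of this series: the library sha256() is a plain synchronous function, so the SHASH_DESC_ON_STACK descriptor and the c->sha256 tfm disappear along with their error paths. The same call in isolation (hypothetical helper name; types from <linux/siphash.h>):

#include <crypto/sha2.h>
#include <linux/siphash.h>
#include <linux/string.h>

static void siphash_key_from_seed(u64 seed, siphash_key_t *key)
{
	u8 digest[SHA256_DIGEST_SIZE];

	sha256((const u8 *) &seed, sizeof(seed), digest);	/* no tfm, no error path */
	memcpy(key, digest, sizeof(*key));
}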
-BCH_ERR_invalid_sb_features; } - block_size = le16_to_cpu(sb->block_size); - - if (block_size > PAGE_SECTORS) { - prt_printf(out, "Block size too big (got %u, max %u)", - block_size, PAGE_SECTORS); - return -BCH_ERR_invalid_sb_block_size; - } - if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) { prt_printf(out, "Bad user UUID (got zeroes)"); return -BCH_ERR_invalid_sb_uuid; @@ -409,6 +410,13 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, return -BCH_ERR_invalid_sb_uuid; } + if (!(flags & BCH_VALIDATE_write) && + le64_to_cpu(sb->offset) != read_offset) { + prt_printf(out, "Bad sb offset (got %llu, read from %llu)", + le64_to_cpu(sb->offset), read_offset); + return -BCH_ERR_invalid_sb_offset; + } + if (!sb->nr_devices || sb->nr_devices > BCH_SB_MEMBERS_MAX) { prt_printf(out, "Bad number of member devices %u (max %u)", @@ -464,6 +472,13 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2) SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true); + + if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb)) + SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30); + + if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_extent_flags && + !BCH_SB_CSUM_ERR_RETRY_NR(sb)) + SET_BCH_SB_CSUM_ERR_RETRY_NR(sb, 3); } #ifdef __KERNEL__ @@ -474,8 +489,8 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { const struct bch_option *opt = bch2_opt_table + opt_id; - if (opt->get_sb != BCH2_NO_SB_OPT) { - u64 v = bch2_opt_from_sb(sb, opt_id); + if (opt->get_sb) { + u64 v = bch2_opt_from_sb(sb, opt_id, -1); prt_printf(out, "Invalid option "); ret = bch2_opt_validate(opt, v, out); @@ -755,7 +770,7 @@ retry: memset(sb, 0, sizeof(*sb)); sb->mode = BLK_OPEN_READ; sb->have_bio = true; - sb->holder = kmalloc(1, GFP_KERNEL); + sb->holder = kzalloc(sizeof(*sb->holder), GFP_KERNEL); if (!sb->holder) return -ENOMEM; @@ -881,7 +896,7 @@ got_super: sb->have_layout = true; - ret = bch2_sb_validate(sb, 0, &err); + ret = bch2_sb_validate(sb->sb, offset, 0, &err); if (ret) { bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n", path, err.buf); @@ -918,16 +933,16 @@ static void write_super_endio(struct bio *bio) { struct bch_dev *ca = bio->bi_private; + bch2_account_io_success_fail(ca, bio_data_dir(bio), !bio->bi_status); + /* XXX: return errors directly */ - if (bch2_dev_io_err_on(bio->bi_status, ca, - bio_data_dir(bio) - ? 
BCH_MEMBER_ERROR_write - : BCH_MEMBER_ERROR_read, - "superblock %s error: %s", + if (bio->bi_status) { + bch_err_dev_ratelimited(ca, "superblock %s error: %s", str_write_read(bio_data_dir(bio)), - bch2_blk_status_to_str(bio->bi_status))) + bch2_blk_status_to_str(bio->bi_status)); ca->sb_write_error = 1; + } closure_put(&ca->fs->sb_write); percpu_ref_put(&ca->io_ref); @@ -1038,7 +1053,7 @@ int bch2_write_super(struct bch_fs *c) darray_for_each(online_devices, ca) { printbuf_reset(&err); - ret = bch2_sb_validate(&(*ca)->disk_sb, BCH_VALIDATE_write, &err); + ret = bch2_sb_validate((*ca)->disk_sb.sb, 0, BCH_VALIDATE_write, &err); if (ret) { bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); goto out; @@ -1166,7 +1181,7 @@ int bch2_write_super(struct bch_fs *c) !can_mount_with_written), c, ": Unable to write superblock to sufficient devices (from %ps)", (void *) _RET_IP_)) - ret = -1; + ret = -BCH_ERR_erofs_sb_err; out: /* Make new options visible after they're persistent: */ bch2_sb_update(c); @@ -1223,12 +1238,11 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat) bch2_sb_field_resize(&c->disk_sb, downgrade, 0); c->disk_sb.sb->version = cpu_to_le16(new_version); - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); if (incompat) { + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version)); - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_FEATURE_incompat_version_field); } } @@ -1459,8 +1473,8 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, for (id = 0; id < bch2_opts_nr; id++) { const struct bch_option *opt = bch2_opt_table + id; - if (opt->get_sb != BCH2_NO_SB_OPT) { - u64 v = bch2_opt_from_sb(sb, id); + if (opt->get_sb) { + u64 v = bch2_opt_from_sb(sb, id, -1); prt_printf(out, "%s:\t", opt->attr.name); bch2_opt_to_text(out, NULL, sb, opt, v, diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index b4cff9ebdebb..78f708a6fbcd 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -21,13 +21,13 @@ static inline bool bch2_version_compatible(u16 version) void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version); enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version); -bool bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version); +int bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version); -static inline bool bch2_request_incompat_feature(struct bch_fs *c, - enum bcachefs_metadata_version version) +static inline int bch2_request_incompat_feature(struct bch_fs *c, + enum bcachefs_metadata_version version) { return likely(version <= c->sb.version_incompat) - ? true + ? 
0 : bch2_set_version_incompat(c, version); } @@ -92,6 +92,8 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); void bch2_free_super(struct bch_sb_handle *); int bch2_sb_realloc(struct bch_sb_handle *, unsigned); +int bch2_sb_validate(struct bch_sb *, u64, enum bch_validate_flags, struct printbuf *); + int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *); int bch2_write_super(struct bch_fs *); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 0459c875e189..20208f3c5d8b 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -75,9 +75,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>"); MODULE_DESCRIPTION("bcachefs filesystem"); -MODULE_SOFTDEP("pre: crc32c"); -MODULE_SOFTDEP("pre: crc64"); -MODULE_SOFTDEP("pre: sha256"); MODULE_SOFTDEP("pre: chacha20"); MODULE_SOFTDEP("pre: poly1305"); MODULE_SOFTDEP("pre: xxhash"); @@ -536,9 +533,11 @@ int bch2_fs_read_write(struct bch_fs *c) int bch2_fs_read_write_early(struct bch_fs *c) { - lockdep_assert_held(&c->state_lock); + down_write(&c->state_lock); + int ret = __bch2_fs_read_write(c, true); + up_write(&c->state_lock); - return __bch2_fs_read_write(c, true); + return ret; } /* Filesystem startup/shutdown: */ @@ -718,7 +717,7 @@ static int bch2_fs_online(struct bch_fs *c) kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: #endif kobject_add(&c->counters_kobj, &c->kobj, "counters") ?: - bch2_opts_create_sysfs_files(&c->opts_dir); + bch2_opts_create_sysfs_files(&c->opts_dir, OPT_FS); if (ret) { bch_err(c, "error creating sysfs objects"); return ret; @@ -837,6 +836,25 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (ret) goto err; +#ifdef CONFIG_UNICODE + /* Default encoding until we can potentially have more as an option. */ + c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING); + if (IS_ERR(c->cf_encoding)) { + printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u", + unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); + ret = -EINVAL; + goto err; + } +#else + if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) { + printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n"); + ret = -EINVAL; + goto err; + } +#endif + pr_uuid(&name, c->sb.user_uuid.b); ret = name.allocation_failure ? 
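bch2_request_incompat_feature() now returns 0 on success and -BCH_ERR_may_not_use_incompat_feature on refusal, rather than a bool; the reflink.c hunk earlier in this section had to grow a '!' for exactly this flip. A sketch of both caller styles under the new convention:

	/* boolean use: */
	bool may_update = !bch2_request_incompat_feature(c, version);

	/* or propagating the errcode: */
	int ret = bch2_request_incompat_feature(c, version);
	if (ret)
		return ret;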
-BCH_ERR_ENOMEM_fs_name_alloc : 0; if (ret) @@ -1003,38 +1021,39 @@ static void print_mount_opts(struct bch_fs *c) int bch2_fs_start(struct bch_fs *c) { time64_t now = ktime_get_real_seconds(); - int ret; + int ret = 0; print_mount_opts(c); down_write(&c->state_lock); + mutex_lock(&c->sb_lock); BUG_ON(test_bit(BCH_FS_started, &c->flags)); - mutex_lock(&c->sb_lock); + if (!bch2_sb_field_get_minsize(&c->disk_sb, ext, + sizeof(struct bch_sb_field_ext) / sizeof(u64))) { + mutex_unlock(&c->sb_lock); + up_write(&c->state_lock); + ret = -BCH_ERR_ENOSPC_sb; + goto err; + } ret = bch2_sb_members_v2_init(c); if (ret) { mutex_unlock(&c->sb_lock); + up_write(&c->state_lock); goto err; } for_each_online_member(c, ca) bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now); - struct bch_sb_field_ext *ext = - bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64)); mutex_unlock(&c->sb_lock); - if (!ext) { - bch_err(c, "insufficient space in superblock for sb_field_ext"); - ret = -BCH_ERR_ENOSPC_sb; - goto err; - } - for_each_rw_member(c, ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); + up_write(&c->state_lock); c->recovery_task = current; ret = BCH_SB_INITIALIZED(c->disk_sb.sb) @@ -1050,30 +1069,28 @@ int bch2_fs_start(struct bch_fs *c) goto err; if (bch2_fs_init_fault("fs_start")) { - bch_err(c, "fs_start fault injected"); - ret = -EINVAL; + ret = -BCH_ERR_injected_fs_start; goto err; } set_bit(BCH_FS_started, &c->flags); + wake_up(&c->ro_ref_wait); + down_write(&c->state_lock); if (c->opts.read_only) { bch2_fs_read_only(c); } else { ret = !test_bit(BCH_FS_rw, &c->flags) ? bch2_fs_read_write(c) : bch2_fs_read_write_late(c); - if (ret) - goto err; } + up_write(&c->state_lock); - ret = 0; err: if (ret) bch_err_msg(c, ret, "starting filesystem"); else bch_verbose(c, "done starting filesystem"); - up_write(&c->state_lock); return ret; } @@ -1280,8 +1297,8 @@ static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) return 0; if (!ca->kobj.state_in_sysfs) { - ret = kobject_add(&ca->kobj, &c->kobj, - "dev-%u", ca->dev_idx); + ret = kobject_add(&ca->kobj, &c->kobj, "dev-%u", ca->dev_idx) ?: + bch2_opts_create_sysfs_files(&ca->kobj, OPT_DEVICE); if (ret) return ret; } @@ -1412,6 +1429,13 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) ca->disk_sb = *sb; memset(sb, 0, sizeof(*sb)); + /* + * Stash pointer to the filesystem for blk_holder_ops - note that once + * attached to a filesystem, we will always close the block device + * before tearing down the filesystem object. 
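The stashed holder is what makes the blk_holder_ops below work: whatever pointer is handed to the block layer at open time comes back as bdev->bd_holder in ->mark_dead()/->sync(). A sketch of the open side (the real open lives in bch2_read_super(), which is not part of this hunk, so the helper name here is illustrative):

#include <linux/blkdev.h>

static int open_sb_bdev(struct bch_sb_handle *sb, const char *path)
{
	sb->s_bdev_file = bdev_file_open_by_path(path,
				BLK_OPEN_READ | BLK_OPEN_WRITE,
				sb->holder,	/* returned via bdev->bd_holder */
				&bch2_sb_handle_bdev_ops);
	if (IS_ERR(sb->s_bdev_file))
		return PTR_ERR(sb->s_bdev_file);

	sb->bdev = file_bdev(sb->s_bdev_file);
	return 0;
}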
+ */ + ca->disk_sb.holder->c = ca->fs; + ca->dev = ca->disk_sb.bdev->bd_dev; percpu_ref_reinit(&ca->io_ref); @@ -1966,15 +1990,12 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) mutex_unlock(&c->sb_lock); if (ca->mi.freespace_initialized) { - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_dev_data_type, - .dev_data_type.dev = ca->dev_idx, - .dev_data_type.data_type = BCH_DATA_free, - }; u64 v[3] = { nbuckets - old_nbuckets, 0, 0 }; ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0, - bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?: + bch2_disk_accounting_mod2(trans, false, v, dev_data_type, + .dev = ca->dev_idx, + .data_type = BCH_DATA_free)) ?: bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); if (ret) goto err; @@ -1998,6 +2019,102 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found); } +/* blk_holder_ops: */ + +static struct bch_fs *bdev_get_fs(struct block_device *bdev) + __releases(&bdev->bd_holder_lock) +{ + struct bch_sb_handle_holder *holder = bdev->bd_holder; + struct bch_fs *c = holder->c; + + if (c && !bch2_ro_ref_tryget(c)) + c = NULL; + + mutex_unlock(&bdev->bd_holder_lock); + + if (c) + wait_event(c->ro_ref_wait, test_bit(BCH_FS_started, &c->flags)); + return c; +} + +/* returns with ref on ca->ref */ +static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev) +{ + for_each_member_device(c, ca) + if (ca->disk_sb.bdev == bdev) + return ca; + return NULL; +} + +static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise) +{ + struct bch_fs *c = bdev_get_fs(bdev); + if (!c) + return; + + struct super_block *sb = c->vfs_sb; + if (sb) { + /* + * Not necessary, c->ro_ref guards against the filesystem being + * unmounted - we only take this to avoid a warning in + * sync_filesystem: + */ + down_read(&sb->s_umount); + } + + down_write(&c->state_lock); + struct bch_dev *ca = bdev_to_bch_dev(c, bdev); + if (!ca) + goto unlock; + + if (bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, BCH_FORCE_IF_DEGRADED)) { + __bch2_dev_offline(c, ca); + } else { + if (sb) { + if (!surprise) + sync_filesystem(sb); + shrink_dcache_sb(sb); + evict_inodes(sb); + } + + bch2_journal_flush(&c->journal); + bch2_fs_emergency_read_only(c); + } + + bch2_dev_put(ca); +unlock: + if (sb) + up_read(&sb->s_umount); + up_write(&c->state_lock); + bch2_ro_ref_put(c); +} + +static void bch2_fs_bdev_sync(struct block_device *bdev) +{ + struct bch_fs *c = bdev_get_fs(bdev); + if (!c) + return; + + struct super_block *sb = c->vfs_sb; + if (sb) { + /* + * Not necessary, c->ro_ref guards against the filesystem being + * unmounted - we only take this to avoid a warning in + * sync_filesystem: + */ + down_read(&sb->s_umount); + sync_filesystem(sb); + up_read(&sb->s_umount); + } + + bch2_ro_ref_put(c); +} + +const struct blk_holder_ops bch2_sb_handle_bdev_ops = { + .mark_dead = bch2_fs_bdev_mark_dead, + .sync = bch2_fs_bdev_sync, +}; + /* Filesystem open: */ static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r) @@ -2142,7 +2259,7 @@ BCH_DEBUG_PARAMS() __maybe_unused static unsigned bch2_metadata_version = bcachefs_metadata_version_current; -module_param_named(version, bch2_metadata_version, uint, 0400); +module_param_named(version, bch2_metadata_version, uint, 0444); module_exit(bcachefs_exit); module_init(bcachefs_init); diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 04f8287eff5c..23533bce5709 100644 --- a/fs/bcachefs/super.h +++ 
b/fs/bcachefs/super.h @@ -42,4 +42,6 @@ void bch2_fs_stop(struct bch_fs *); int bch2_fs_start(struct bch_fs *); struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); +extern const struct blk_holder_ops bch2_sb_handle_bdev_ops; + #endif /* _BCACHEFS_SUPER_H */ diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index 368a63d938cf..3a899f799d1d 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -2,13 +2,19 @@ #ifndef _BCACHEFS_SUPER_TYPES_H #define _BCACHEFS_SUPER_TYPES_H +struct bch_fs; + +struct bch_sb_handle_holder { + struct bch_fs *c; +}; + struct bch_sb_handle { struct bch_sb *sb; struct file *s_bdev_file; struct block_device *bdev; char *sb_name; struct bio *bio; - void *holder; + struct bch_sb_handle_holder *holder; size_t buffer_size; blk_mode_t mode; unsigned have_layout:1; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index a7eb1f511484..e5f003c29369 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -146,15 +146,14 @@ write_attribute(trigger_journal_writes); write_attribute(trigger_btree_cache_shrink); write_attribute(trigger_btree_key_cache_shrink); write_attribute(trigger_freelist_wakeup); +write_attribute(trigger_btree_updates); read_attribute(gc_gens_pos); read_attribute(uuid); read_attribute(minor); read_attribute(flags); -read_attribute(bucket_size); read_attribute(first_bucket); read_attribute(nbuckets); -rw_attribute(durability); read_attribute(io_done); read_attribute(io_errors); write_attribute(io_errors_reset); @@ -173,10 +172,8 @@ read_attribute(journal_debug); read_attribute(btree_cache); read_attribute(btree_key_cache); read_attribute(btree_reserve_cache); -read_attribute(stripes_heap); read_attribute(open_buckets); read_attribute(open_buckets_partial); -read_attribute(write_points); read_attribute(nocow_lock_table); #ifdef BCH_WRITE_REF_DEBUG @@ -209,8 +206,6 @@ read_attribute(usage_base); BCH_PERSISTENT_COUNTERS() #undef x -rw_attribute(discard); -read_attribute(state); rw_attribute(label); read_attribute(copy_gc_wait); @@ -262,10 +257,8 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c prt_printf(out, "type\tcompressed\runcompressed\raverage extent size\r\n"); for (unsigned i = 1; i < BCH_COMPRESSION_TYPE_NR; i++) { - struct disk_accounting_pos a = { - .type = BCH_DISK_ACCOUNTING_compression, - .compression.type = i, - }; + struct disk_accounting_pos a; + disk_accounting_key_init(a, compression, .type = i); struct bpos p = disk_accounting_pos_to_bpos(&a); u64 v[3]; bch2_accounting_mem_read(c, p, v, ARRAY_SIZE(v)); @@ -355,18 +348,12 @@ SHOW(bch2_fs) if (attr == &sysfs_btree_reserve_cache) bch2_btree_reserve_cache_to_text(out, c); - if (attr == &sysfs_stripes_heap) - bch2_stripes_heap_to_text(out, c); - if (attr == &sysfs_open_buckets) bch2_open_buckets_to_text(out, c, NULL); if (attr == &sysfs_open_buckets_partial) bch2_open_buckets_partial_to_text(out, c); - if (attr == &sysfs_write_points) - bch2_write_points_to_text(out, c); - if (attr == &sysfs_compression_stats) bch2_compression_stats_to_text(out, c); @@ -415,6 +402,9 @@ STORE(bch2_fs) /* Debugging: */ + if (attr == &sysfs_trigger_btree_updates) + queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)) return -EROFS; @@ -566,10 +556,8 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_btree_key_cache, &sysfs_btree_reserve_cache, &sysfs_new_stripes, - &sysfs_stripes_heap, &sysfs_open_buckets, &sysfs_open_buckets_partial, 
- &sysfs_write_points, #ifdef BCH_WRITE_REF_DEBUG &sysfs_write_refs, #endif @@ -585,6 +573,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_btree_cache_shrink, &sysfs_trigger_btree_key_cache_shrink, &sysfs_trigger_freelist_wakeup, + &sysfs_trigger_btree_updates, &sysfs_gc_gens_pos, @@ -604,26 +593,34 @@ struct attribute *bch2_fs_internal_files[] = { /* options */ -SHOW(bch2_fs_opts_dir) +static ssize_t sysfs_opt_show(struct bch_fs *c, + struct bch_dev *ca, + enum bch_opt_id id, + struct printbuf *out) { - struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); - const struct bch_option *opt = container_of(attr, struct bch_option, attr); - int id = opt - bch2_opt_table; - u64 v = bch2_opt_get_by_id(&c->opts, id); + const struct bch_option *opt = bch2_opt_table + id; + u64 v; + + if (opt->flags & OPT_FS) { + v = bch2_opt_get_by_id(&c->opts, id); + } else if ((opt->flags & OPT_DEVICE) && opt->get_member) { + v = bch2_opt_from_sb(c->disk_sb.sb, id, ca->dev_idx); + } else { + return -EINVAL; + } bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST); prt_char(out, '\n'); - return 0; } -STORE(bch2_fs_opts_dir) +static ssize_t sysfs_opt_store(struct bch_fs *c, + struct bch_dev *ca, + enum bch_opt_id id, + const char *buf, size_t size) { - struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); - const struct bch_option *opt = container_of(attr, struct bch_option, attr); - int ret, id = opt - bch2_opt_table; - char *tmp; - u64 v; + const struct bch_option *opt = bch2_opt_table + id; + int ret = 0; /* * We don't need to take c->writes for correctness, but it eliminates an @@ -632,27 +629,26 @@ STORE(bch2_fs_opts_dir) if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))) return -EROFS; - tmp = kstrdup(buf, GFP_KERNEL); + char *tmp = kstrdup(buf, GFP_KERNEL); if (!tmp) { ret = -ENOMEM; goto err; } - ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL); + u64 v; + ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL) ?: + bch2_opt_check_may_set(c, ca, id, v); kfree(tmp); if (ret < 0) goto err; - ret = bch2_opt_check_may_set(c, id, v); - if (ret < 0) - goto err; - - bch2_opt_set_sb(c, NULL, opt, v); + bch2_opt_set_sb(c, ca, opt, v); bch2_opt_set_by_id(&c->opts, id, v); if (v && (id == Opt_background_target || + (id == Opt_foreground_target && !c->opts.background_target) || id == Opt_background_compression || (id == Opt_compression && !c->opts.background_compression))) bch2_set_rebalance_needs_scan(c, 0); @@ -664,27 +660,55 @@ STORE(bch2_fs_opts_dir) c->copygc_thread) wake_up_process(c->copygc_thread); + if (id == Opt_discard && !ca) { + mutex_lock(&c->sb_lock); + for_each_member_device(c, ca) + opt->set_member(bch2_members_v2_get_mut(ca->disk_sb.sb, ca->dev_idx), v); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + ret = size; err: bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); return ret; } + +SHOW(bch2_fs_opts_dir) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); + int id = bch2_opt_lookup(attr->name); + if (id < 0) + return 0; + + return sysfs_opt_show(c, NULL, id, out); +} + +STORE(bch2_fs_opts_dir) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); + int id = bch2_opt_lookup(attr->name); + if (id < 0) + return 0; + + return sysfs_opt_store(c, NULL, id, buf, size); +} SYSFS_OPS(bch2_fs_opts_dir); struct attribute *bch2_fs_opts_dir_files[] = { NULL }; -int bch2_opts_create_sysfs_files(struct kobject *kobj) +int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type) { - const struct bch_option 
*i; - int ret; - - for (i = bch2_opt_table; + for (const struct bch_option *i = bch2_opt_table; i < bch2_opt_table + bch2_opts_nr; i++) { - if (!(i->flags & OPT_FS)) + if (i->flags & OPT_HIDDEN) + continue; + if (!(i->flags & type)) continue; - ret = sysfs_create_file(kobj, &i->attr); + int ret = sysfs_create_file(kobj, &i->attr); if (ret) return ret; } @@ -755,11 +779,8 @@ SHOW(bch2_dev) sysfs_printf(uuid, "%pU\n", ca->uuid.b); - sysfs_print(bucket_size, bucket_bytes(ca)); sysfs_print(first_bucket, ca->mi.first_bucket); sysfs_print(nbuckets, ca->mi.nbuckets); - sysfs_print(durability, ca->mi.durability); - sysfs_print(discard, ca->mi.discard); if (attr == &sysfs_label) { if (ca->mi.group) @@ -772,11 +793,6 @@ SHOW(bch2_dev) prt_char(out, '\n'); } - if (attr == &sysfs_state) { - prt_string_option(out, bch2_member_states, ca->mi.state); - prt_char(out, '\n'); - } - if (attr == &sysfs_io_done) dev_io_done_to_text(out, ca); @@ -802,6 +818,10 @@ SHOW(bch2_dev) if (attr == &sysfs_open_buckets) bch2_open_buckets_to_text(out, c, ca); + int opt_id = bch2_opt_lookup(attr->name); + if (opt_id >= 0) + return sysfs_opt_show(c, ca, opt_id, out); + return 0; } @@ -810,18 +830,6 @@ STORE(bch2_dev) struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); struct bch_fs *c = ca->fs; - if (attr == &sysfs_discard) { - bool v = strtoul_or_return(buf); - - bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_discard, v); - } - - if (attr == &sysfs_durability) { - u64 v = strtoul_or_return(buf); - - bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_durability, v); - } - if (attr == &sysfs_label) { char *tmp; int ret; @@ -839,20 +847,20 @@ STORE(bch2_dev) if (attr == &sysfs_io_errors_reset) bch2_dev_errors_reset(ca); + int opt_id = bch2_opt_lookup(attr->name); + if (opt_id >= 0) + return sysfs_opt_store(c, ca, opt_id, buf, size); + return size; } SYSFS_OPS(bch2_dev); struct attribute *bch2_dev_files[] = { &sysfs_uuid, - &sysfs_bucket_size, &sysfs_first_bucket, &sysfs_nbuckets, - &sysfs_durability, /* settings: */ - &sysfs_discard, - &sysfs_state, &sysfs_label, &sysfs_has_data, diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h index 222cd5062702..303e0433c702 100644 --- a/fs/bcachefs/sysfs.h +++ b/fs/bcachefs/sysfs.h @@ -23,7 +23,7 @@ extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; extern const struct sysfs_ops bch2_dev_sysfs_ops; -int bch2_opts_create_sysfs_files(struct kobject *); +int bch2_opts_create_sysfs_files(struct kobject *, unsigned); #else @@ -41,7 +41,8 @@ static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; static const struct sysfs_ops bch2_dev_sysfs_ops; -static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; } +static inline int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type) +{ return 0; } #endif /* NO_BCACHEFS_SYSFS */ diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c index 3fe82757f93a..2c34fe4be912 100644 --- a/fs/bcachefs/time_stats.c +++ b/fs/bcachefs/time_stats.c @@ -10,6 +10,9 @@ #include "eytzinger.h" #include "time_stats.h" +/* disable automatic switching to percpu mode */ +#define TIME_STATS_NONPCPU ((unsigned long) 1) + static const struct time_unit time_units[] = { { "ns", 1 }, { "us", NSEC_PER_USEC }, @@ -123,11 +126,12 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) { unsigned long flags; - if (!stats->buffer) { + if ((unsigned long) stats->buffer <= 
TIME_STATS_NONPCPU) { spin_lock_irqsave(&stats->lock, flags); time_stats_update_one(stats, start, end); - if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 && + if (!stats->buffer && + mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 && stats->duration_stats.n > 1024) stats->buffer = alloc_percpu_gfp(struct time_stat_buffer, @@ -157,7 +161,7 @@ void bch2_time_stats_reset(struct bch2_time_stats *stats) unsigned offset = offsetof(struct bch2_time_stats, min_duration); memset((void *) stats + offset, 0, sizeof(*stats) - offset); - if (stats->buffer) { + if ((unsigned long) stats->buffer > TIME_STATS_NONPCPU) { int cpu; for_each_possible_cpu(cpu) per_cpu_ptr(stats->buffer, cpu)->nr = 0; @@ -167,7 +171,9 @@ void bch2_time_stats_reset(struct bch2_time_stats *stats) void bch2_time_stats_exit(struct bch2_time_stats *stats) { - free_percpu(stats->buffer); + if ((unsigned long) stats->buffer > TIME_STATS_NONPCPU) + free_percpu(stats->buffer); + stats->buffer = NULL; } void bch2_time_stats_init(struct bch2_time_stats *stats) @@ -177,3 +183,9 @@ void bch2_time_stats_init(struct bch2_time_stats *stats) stats->min_freq = U64_MAX; spin_lock_init(&stats->lock); } + +void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *stats) +{ + bch2_time_stats_init(stats); + stats->buffer = (struct time_stat_buffer __percpu *) TIME_STATS_NONPCPU; +} diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h index dc6493f7bbab..eddb0985bab4 100644 --- a/fs/bcachefs/time_stats.h +++ b/fs/bcachefs/time_stats.h @@ -145,6 +145,7 @@ static inline bool track_event_change(struct bch2_time_stats *stats, bool v) void bch2_time_stats_reset(struct bch2_time_stats *); void bch2_time_stats_exit(struct bch2_time_stats *); void bch2_time_stats_init(struct bch2_time_stats *); +void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *); static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq) { diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index c1b51009edf6..519d00d62ae7 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -295,12 +295,12 @@ TRACE_EVENT(write_super, /* io.c: */ -DEFINE_EVENT(bio, read_promote, +DEFINE_EVENT(bio, io_read_promote, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); -TRACE_EVENT(read_nopromote, +TRACE_EVENT(io_read_nopromote, TP_PROTO(struct bch_fs *c, int ret), TP_ARGS(c, ret), @@ -319,26 +319,50 @@ TRACE_EVENT(read_nopromote, __entry->ret) ); -DEFINE_EVENT(bio, read_bounce, +DEFINE_EVENT(bio, io_read_bounce, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); -DEFINE_EVENT(bio, read_split, +DEFINE_EVENT(bio, io_read_split, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); -DEFINE_EVENT(bio, read_retry, +DEFINE_EVENT(bio, io_read_retry, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); -DEFINE_EVENT(bio, read_reuse_race, +DEFINE_EVENT(bio, io_read_reuse_race, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); +/* ec.c */ + +TRACE_EVENT(stripe_create, + TP_PROTO(struct bch_fs *c, u64 idx, int ret), + TP_ARGS(c, idx, ret), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u64, idx ) + __field(int, ret ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->idx = idx; + __entry->ret = ret; + ), + + TP_printk("%d,%d idx %llu ret %i", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->idx, + __entry->ret) +); + /* Journal */ DEFINE_EVENT(bch_fs, journal_full, @@ -797,53 +821,37 @@ TRACE_EVENT(bucket_invalidate, /* Moving IO */ -TRACE_EVENT(bucket_evacuate, - TP_PROTO(struct bch_fs *c, 
struct bpos *bucket), - TP_ARGS(c, bucket), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u32, dev_idx ) - __field(u64, bucket ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->dev_idx = bucket->inode; - __entry->bucket = bucket->offset; - ), - - TP_printk("%d:%d %u:%llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->dev_idx, __entry->bucket) +DEFINE_EVENT(fs_str, io_move, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -DEFINE_EVENT(fs_str, move_extent, +DEFINE_EVENT(fs_str, io_move_read, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); -DEFINE_EVENT(fs_str, move_extent_read, +DEFINE_EVENT(fs_str, io_move_write, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); -DEFINE_EVENT(fs_str, move_extent_write, +DEFINE_EVENT(fs_str, io_move_finish, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); -DEFINE_EVENT(fs_str, move_extent_finish, +DEFINE_EVENT(fs_str, io_move_fail, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); -DEFINE_EVENT(fs_str, move_extent_fail, +DEFINE_EVENT(fs_str, io_move_write_fail, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); -DEFINE_EVENT(fs_str, move_extent_start_fail, +DEFINE_EVENT(fs_str, io_move_start_fail, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); @@ -881,37 +889,6 @@ TRACE_EVENT(move_data, __entry->sectors_raced) ); -TRACE_EVENT(evacuate_bucket, - TP_PROTO(struct bch_fs *c, struct bpos *bucket, - unsigned sectors, unsigned bucket_size, - int ret), - TP_ARGS(c, bucket, sectors, bucket_size, ret), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u64, member ) - __field(u64, bucket ) - __field(u32, sectors ) - __field(u32, bucket_size ) - __field(int, ret ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->member = bucket->inode; - __entry->bucket = bucket->offset; - __entry->sectors = sectors; - __entry->bucket_size = bucket_size; - __entry->ret = ret; - ), - - TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->member, __entry->bucket, - __entry->sectors, __entry->bucket_size, - __entry->ret) -); - TRACE_EVENT(copygc, TP_PROTO(struct bch_fs *c, u64 buckets, diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index da2cd11b3025..87af551692f4 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -270,7 +270,7 @@ static void __bch2_print_string_as_lines(const char *prefix, const char *lines, locked = console_trylock(); } - while (1) { + while (*lines) { p = strchrnul(lines, '\n'); printk("%s%.*s\n", prefix, (int) (p - lines), lines); if (!*p) @@ -473,10 +473,10 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats u64 last_q = 0; prt_printf(out, "quantiles (%s):\t", u->name); - eytzinger0_for_each(i, NR_QUANTILES) { - bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; + eytzinger0_for_each(j, NR_QUANTILES) { + bool is_last = eytzinger0_next(j, NR_QUANTILES) == -1; - u64 q = max(quantiles->entries[i].m, last_q); + u64 q = max(quantiles->entries[j].m, last_q); prt_printf(out, "%llu ", div64_u64(q, u->nsecs)); if (is_last) prt_newline(out); @@ -704,12 +704,33 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) } } +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_corrupt_bio(struct bio *bio) +{ + struct bvec_iter iter; + struct bio_vec bv; + unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64)); + + bio_for_each_segment(bv, bio, iter) { + unsigned u64s = bv.bv_len / sizeof(u64); + + 
if (offset < u64s) { + u64 *segment = bvec_kmap_local(&bv); + segment[offset] = get_random_u64(); + kunmap_local(segment); + return; + } + offset -= u64s; + } +} +#endif + #if 0 void eytzinger1_test(void) { - unsigned inorder, eytz, size; + unsigned inorder, size; - pr_info("1 based eytzinger test:"); + pr_info("1 based eytzinger test:\n"); for (size = 2; size < 65536; @@ -717,13 +738,7 @@ void eytzinger1_test(void) unsigned extra = eytzinger1_extra(size); if (!(size % 4096)) - pr_info("tree size %u", size); - - BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size)); - BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size)); - - BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0); - BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0); + pr_info("tree size %u\n", size); inorder = 1; eytzinger1_for_each(eytz, size) { @@ -734,15 +749,16 @@ void eytzinger1_test(void) inorder++; } + BUG_ON(inorder - 1 != size); } } void eytzinger0_test(void) { - unsigned inorder, eytz, size; + unsigned inorder, size; - pr_info("0 based eytzinger test:"); + pr_info("0 based eytzinger test:\n"); for (size = 1; size < 65536; @@ -750,13 +766,7 @@ void eytzinger0_test(void) unsigned extra = eytzinger0_extra(size); if (!(size % 4096)) - pr_info("tree size %u", size); - - BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size)); - BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size)); - - BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1); - BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1); + pr_info("tree size %u\n", size); inorder = 0; eytzinger0_for_each(eytz, size) { @@ -767,54 +777,191 @@ void eytzinger0_test(void) inorder++; } + BUG_ON(inorder != size); + + inorder = size - 1; + eytzinger0_for_each_prev(eytz, size) { + BUG_ON(eytz != eytzinger0_first(size) && + eytzinger0_next(eytzinger0_prev(eytz, size), size) != eytz); + + inorder--; + } + BUG_ON(inorder != -1); } } -static inline int cmp_u16(const void *_l, const void *_r, size_t size) +static inline int cmp_u16(const void *_l, const void *_r) { const u16 *l = _l, *r = _r; - return (*l > *r) - (*r - *l); + return (*l > *r) - (*r > *l); } -static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) +static void eytzinger0_find_test_le(u16 *test_array, unsigned nr, u16 search) { - int i, c1 = -1, c2 = -1; - ssize_t r; + int r, s; + bool bad; r = eytzinger0_find_le(test_array, nr, sizeof(test_array[0]), cmp_u16, &search); - if (r >= 0) - c1 = test_array[r]; - - for (i = 0; i < nr; i++) - if (test_array[i] <= search && test_array[i] > c2) - c2 = test_array[i]; - - if (c1 != c2) { - eytzinger0_for_each(i, nr) - pr_info("[%3u] = %12u", i, test_array[i]); - pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i", - i, r, c1, c2); + if (r >= 0) { + if (test_array[r] > search) { + bad = true; + } else { + s = eytzinger0_next(r, nr); + bad = s >= 0 && test_array[s] <= search; + } + } else { + s = eytzinger0_last(nr); + bad = s >= 0 && test_array[s] <= search; + } + + if (bad) { + s = -1; + eytzinger0_for_each_prev(j, nr) { + if (test_array[j] <= search) { + s = j; + break; + } + } + + eytzinger0_for_each(j, nr) + pr_info("[%3u] = %12u\n", j, test_array[j]); + pr_info("find_le(%12u) = %3i should be %3i\n", + search, r, s); + BUG(); } } +static void eytzinger0_find_test_gt(u16 *test_array, unsigned nr, u16 search) +{ + int r, s; + bool bad; + + r = eytzinger0_find_gt(test_array, nr, + sizeof(test_array[0]), + cmp_u16, &search); + if (r >= 0) { + if (test_array[r] <= search) { + bad = true; + } else { + s = 
eytzinger0_prev(r, nr); + bad = s >= 0 && test_array[s] > search; + } + } else { + s = eytzinger0_first(nr); + bad = s >= 0 && test_array[s] > search; + } + + if (bad) { + s = -1; + eytzinger0_for_each(j, nr) { + if (test_array[j] > search) { + s = j; + break; + } + } + + eytzinger0_for_each(j, nr) + pr_info("[%3u] = %12u\n", j, test_array[j]); + pr_info("find_gt(%12u) = %3i should be %3i\n", + search, r, s); + BUG(); + } +} + +static void eytzinger0_find_test_ge(u16 *test_array, unsigned nr, u16 search) +{ + int r, s; + bool bad; + + r = eytzinger0_find_ge(test_array, nr, + sizeof(test_array[0]), + cmp_u16, &search); + if (r >= 0) { + if (test_array[r] < search) { + bad = true; + } else { + s = eytzinger0_prev(r, nr); + bad = s >= 0 && test_array[s] >= search; + } + } else { + s = eytzinger0_first(nr); + bad = s >= 0 && test_array[s] >= search; + } + + if (bad) { + s = -1; + eytzinger0_for_each(j, nr) { + if (test_array[j] >= search) { + s = j; + break; + } + } + + eytzinger0_for_each(j, nr) + pr_info("[%3u] = %12u\n", j, test_array[j]); + pr_info("find_ge(%12u) = %3i should be %3i\n", + search, r, s); + BUG(); + } +} + +static void eytzinger0_find_test_eq(u16 *test_array, unsigned nr, u16 search) +{ + unsigned r; + int s; + bool bad; + + r = eytzinger0_find(test_array, nr, + sizeof(test_array[0]), + cmp_u16, &search); + + if (r < nr) { + bad = test_array[r] != search; + } else { + s = eytzinger0_find_le(test_array, nr, + sizeof(test_array[0]), + cmp_u16, &search); + bad = s >= 0 && test_array[s] == search; + } + + if (bad) { + eytzinger0_for_each(j, nr) + pr_info("[%3u] = %12u\n", j, test_array[j]); + pr_info("find(%12u) = %3i is incorrect\n", + search, r); + BUG(); + } +} + +static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) +{ + eytzinger0_find_test_le(test_array, nr, search); + eytzinger0_find_test_gt(test_array, nr, search); + eytzinger0_find_test_ge(test_array, nr, search); + eytzinger0_find_test_eq(test_array, nr, search); +} + void eytzinger0_find_test(void) { unsigned i, nr, allocated = 1 << 12; u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL); for (nr = 1; nr < allocated; nr++) { - pr_info("testing %u elems", nr); + u16 prev = 0; + + pr_info("testing %u elems\n", nr); get_random_bytes(test_array, nr * sizeof(test_array[0])); eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL); /* verify array is sorted correctly: */ - eytzinger0_for_each(i, nr) - BUG_ON(i != eytzinger0_last(nr) && - test_array[i] > test_array[eytzinger0_next(i, nr)]); + eytzinger0_for_each(j, nr) { + BUG_ON(test_array[j] < prev); + prev = test_array[j]; + } for (i = 0; i < U16_MAX; i += 1 << 12) eytzinger0_find_test_val(test_array, nr, i); diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index f4a4783219d9..1e94f89aabed 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -94,6 +94,7 @@ do { \ #define printbuf_tabstop_push(_buf, _n) bch2_printbuf_tabstop_push(_buf, _n) #define printbuf_indent_add(_out, _n) bch2_printbuf_indent_add(_out, _n) +#define printbuf_indent_add_nextline(_out, _n) bch2_printbuf_indent_add_nextline(_out, _n) #define printbuf_indent_sub(_out, _n) bch2_printbuf_indent_sub(_out, _n) #define prt_newline(_out) bch2_prt_newline(_out) @@ -406,6 +407,18 @@ u64 bch2_get_random_u64_below(u64); void memcpy_to_bio(struct bio *, struct bvec_iter, const void *); void memcpy_from_bio(void *, struct bio *, struct bvec_iter); +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_corrupt_bio(struct bio *); + +static inline void 
bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio) +{ + if (ratio && !get_random_u32_below(ratio)) + bch2_corrupt_bio(bio); +} +#else +#define bch2_maybe_corrupt_bio(...) do {} while (0) +#endif + static inline void memcpy_u64s_small(void *dst, const void *src, unsigned u64s) { @@ -419,7 +432,7 @@ static inline void memcpy_u64s_small(void *dst, const void *src, static inline void __memcpy_u64s(void *dst, const void *src, unsigned u64s) { -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN) long d0, d1, d2; asm volatile("rep ; movsq" @@ -496,7 +509,7 @@ static inline void __memmove_u64s_up(void *_dst, const void *_src, u64 *dst = (u64 *) _dst + u64s - 1; u64 *src = (u64 *) _src + u64s - 1; -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN) long d0, d1, d2; asm volatile("std ;\n" diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index aed7c6984173..f9667b944c0d 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -523,7 +523,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, if (ret < 0) goto err_class_exit; - ret = bch2_opt_check_may_set(c, opt_id, v); + ret = bch2_opt_check_may_set(c, NULL, opt_id, v); if (ret < 0) goto err_class_exit; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 8054f44d39cf..584fa89bc877 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -762,8 +762,7 @@ static int parse_elf_property(const char *data, size_t *off, size_t datasz, } #define NOTE_DATA_SZ SZ_1K -#define GNU_PROPERTY_TYPE_0_NAME "GNU" -#define NOTE_NAME_SZ (sizeof(GNU_PROPERTY_TYPE_0_NAME)) +#define NOTE_NAME_SZ (sizeof(NN_GNU_PROPERTY_TYPE_0)) static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, struct arch_elf_state *arch) @@ -800,7 +799,7 @@ static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, if (note.nhdr.n_type != NT_GNU_PROPERTY_TYPE_0 || note.nhdr.n_namesz != NOTE_NAME_SZ || strncmp(note.data + sizeof(note.nhdr), - GNU_PROPERTY_TYPE_0_NAME, n - sizeof(note.nhdr))) + NN_GNU_PROPERTY_TYPE_0, n - sizeof(note.nhdr))) return -ENOEXEC; off = round_up(sizeof(note.nhdr) + NOTE_NAME_SZ, @@ -1603,14 +1602,14 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm) do i += 2; while (auxv[i - 2] != AT_NULL); - fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv); + fill_note(note, NN_AUXV, NT_AUXV, i * sizeof(elf_addr_t), auxv); } static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, const kernel_siginfo_t *siginfo) { copy_siginfo_to_external(csigdata, siginfo); - fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata); + fill_note(note, NN_SIGINFO, NT_SIGINFO, sizeof(*csigdata), csigdata); } /* @@ -1706,7 +1705,7 @@ static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm } size = name_curpos - (char *)data; - fill_note(note, "CORE", NT_FILE, size, data); + fill_note(note, NN_FILE, NT_FILE, size, data); return 0; } @@ -1767,7 +1766,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, regset_get(t->task, &view->regsets[0], sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg); - fill_note(&t->notes[0], "CORE", NT_PRSTATUS, + fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, PRSTATUS_SIZE, &t->prstatus); info->size += notesize(&t->notes[0]); @@ -1801,7 +1800,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, if (is_fpreg) SET_PR_FPVALID(&t->prstatus); - fill_note(&t->notes[note_iter], is_fpreg ? 
"CORE" : "LINUX", + fill_note(&t->notes[note_iter], is_fpreg ? NN_PRFPREG : "LINUX", note_type, ret, data); info->size += notesize(&t->notes[note_iter]); @@ -1821,7 +1820,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, fill_prstatus(&t->prstatus.common, p, signr); elf_core_copy_task_regs(p, &t->prstatus.pr_reg); - fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), + fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, sizeof(t->prstatus), &(t->prstatus)); info->size += notesize(&t->notes[0]); @@ -1832,7 +1831,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, } t->prstatus.pr_fpvalid = 1; - fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(*fpu), fpu); + fill_note(&t->notes[1], NN_PRFPREG, NT_PRFPREG, sizeof(*fpu), fpu); info->size += notesize(&t->notes[1]); return 1; @@ -1852,7 +1851,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); if (!psinfo) return 0; - fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); + fill_note(&info->psinfo, NN_PRPSINFO, NT_PRPSINFO, sizeof(*psinfo), psinfo); #ifdef CORE_DUMP_USE_REGSET view = task_user_regset_view(dump_task); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index c13ee8180b17..9133f3827f90 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1024,7 +1024,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, /* deal with each load segment separately */ phdr = params->phdrs; for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { - unsigned long maddr, disp, excess, excess1; + unsigned long maddr, disp, excess; int prot = 0, flags; if (phdr->p_type != PT_LOAD) @@ -1120,9 +1120,10 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, * extant in the file */ excess = phdr->p_memsz - phdr->p_filesz; - excess1 = PAGE_SIZE - ((maddr + phdr->p_filesz) & ~PAGE_MASK); #ifdef CONFIG_MMU + unsigned long excess1 + = PAGE_SIZE - ((maddr + phdr->p_filesz) & ~PAGE_MASK); if (excess > excess1) { unsigned long xaddr = maddr + phdr->p_filesz + excess1; unsigned long xmaddr; @@ -1397,7 +1398,7 @@ static struct elf_thread_status *elf_dump_thread_status(long signr, struct task_ regset_get(p, &view->regsets[0], sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg); - fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), + fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, sizeof(t->prstatus), &t->prstatus); t->num_notes++; *sz += notesize(&t->notes[0]); @@ -1415,7 +1416,7 @@ static struct elf_thread_status *elf_dump_thread_status(long signr, struct task_ } if (t->prstatus.pr_fpvalid) { - fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu), + fill_note(&t->notes[1], NN_PRFPREG, NT_PRFPREG, sizeof(t->fpu), &t->fpu); t->num_notes++; *sz += notesize(&t->notes[1]); @@ -1530,7 +1531,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) */ fill_psinfo(psinfo, current->group_leader, current->mm); - fill_note(&psinfo_note, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); + fill_note(&psinfo_note, NN_PRPSINFO, NT_PRPSINFO, sizeof(*psinfo), psinfo); thread_status_size += notesize(&psinfo_note); auxv = (elf_addr_t *) current->mm->saved_auxv; @@ -1538,7 +1539,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) do i += 2; while (auxv[i - 2] != AT_NULL); - fill_note(&auxv_note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv); + fill_note(&auxv_note, NN_AUXV, NT_AUXV, i * sizeof(elf_addr_t), auxv); thread_status_size += 
notesize(&auxv_note); offset = sizeof(*elf); /* ELF header */ diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c index 3fe9f59ef867..08412532db1b 100644 --- a/fs/bpf_fs_kfuncs.c +++ b/fs/bpf_fs_kfuncs.c @@ -2,10 +2,12 @@ /* Copyright (c) 2024 Google LLC. */ #include <linux/bpf.h> +#include <linux/bpf_lsm.h> #include <linux/btf.h> #include <linux/btf_ids.h> #include <linux/dcache.h> #include <linux/fs.h> +#include <linux/fsnotify.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/xattr.h> @@ -93,6 +95,24 @@ __bpf_kfunc int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz) return len; } +static bool match_security_bpf_prefix(const char *name__str) +{ + return !strncmp(name__str, XATTR_NAME_BPF_LSM, XATTR_NAME_BPF_LSM_LEN); +} + +static int bpf_xattr_read_permission(const char *name, struct inode *inode) +{ + if (WARN_ON(!inode)) + return -EINVAL; + + /* Allow reading xattr with user. and security.bpf. prefix */ + if (strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && + !match_security_bpf_prefix(name)) + return -EPERM; + + return inode_permission(&nop_mnt_idmap, inode, MAY_READ); +} + /** * bpf_get_dentry_xattr - get xattr of a dentry * @dentry: dentry to get xattr from @@ -101,9 +121,10 @@ __bpf_kfunc int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz) * * Get xattr *name__str* of *dentry* and store the output in *value_ptr*. * - * For security reasons, only *name__str* with prefix "user." is allowed. + * For security reasons, only *name__str* with prefixes "user." or + * "security.bpf." are allowed. * - * Return: 0 on success, a negative value on error. + * Return: length of the xattr value on success, a negative value on error. */ __bpf_kfunc int bpf_get_dentry_xattr(struct dentry *dentry, const char *name__str, struct bpf_dynptr *value_p) @@ -114,18 +135,12 @@ __bpf_kfunc int bpf_get_dentry_xattr(struct dentry *dentry, const char *name__st void *value; int ret; - if (WARN_ON(!inode)) - return -EINVAL; - - if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) - return -EPERM; - value_len = __bpf_dynptr_size(value_ptr); value = __bpf_dynptr_data_rw(value_ptr, value_len); if (!value) return -EINVAL; - ret = inode_permission(&nop_mnt_idmap, inode, MAY_READ); + ret = bpf_xattr_read_permission(name__str, inode); if (ret) return ret; return __vfs_getxattr(dentry, inode, name__str, value, value_len); @@ -139,9 +154,10 @@ __bpf_kfunc int bpf_get_dentry_xattr(struct dentry *dentry, const char *name__st * * Get xattr *name__str* of *file* and store the output in *value_ptr*. * - * For security reasons, only *name__str* with prefix "user." is allowed. + * For security reasons, only *name__str* with prefixes "user." or + * "security.bpf." are allowed. * - * Return: 0 on success, a negative value on error. + * Return: length of the xattr value on success, a negative value on error. */ __bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str, struct bpf_dynptr *value_p) @@ -154,6 +170,160 @@ __bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str, __bpf_kfunc_end_defs(); +static int bpf_xattr_write_permission(const char *name, struct inode *inode) +{ + if (WARN_ON(!inode)) + return -EINVAL; + + /* Only allow setting and removing security.bpf. 
xattrs */ + if (!match_security_bpf_prefix(name)) + return -EPERM; + + return inode_permission(&nop_mnt_idmap, inode, MAY_WRITE); +} + +/** + * bpf_set_dentry_xattr_locked - set a xattr of a dentry + * @dentry: dentry to set the xattr on + * @name__str: name of the xattr + * @value_p: xattr value + * @flags: flags to pass into filesystem operations + * + * Set xattr *name__str* of *dentry* to the value in *value_p*. + * + * For security reasons, only *name__str* with prefix "security.bpf." + * is allowed. + * + * The caller already locked dentry->d_inode. + * + * Return: 0 on success, a negative value on error. + */ +int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str, + const struct bpf_dynptr *value_p, int flags) +{ + + struct bpf_dynptr_kern *value_ptr = (struct bpf_dynptr_kern *)value_p; + struct inode *inode = d_inode(dentry); + const void *value; + u32 value_len; + int ret; + + value_len = __bpf_dynptr_size(value_ptr); + value = __bpf_dynptr_data(value_ptr, value_len); + if (!value) + return -EINVAL; + + ret = bpf_xattr_write_permission(name__str, inode); + if (ret) + return ret; + + ret = __vfs_setxattr(&nop_mnt_idmap, dentry, inode, name__str, + value, value_len, flags); + if (!ret) { + fsnotify_xattr(dentry); + + /* This xattr is set by BPF LSM, so we do not call + * security_inode_post_setxattr. Otherwise, we would + * risk deadlocks by calling back to the same kfunc. + * + * This is the same as security_inode_setsecurity(). + */ + } + return ret; +} + +/** + * bpf_remove_dentry_xattr_locked - remove a xattr of a dentry + * @dentry: dentry to remove the xattr from + * @name__str: name of the xattr + * + * Remove xattr *name__str* of *dentry*. + * + * For security reasons, only *name__str* with prefix "security.bpf." + * is allowed. + * + * The caller already locked dentry->d_inode. + * + * Return: 0 on success, a negative value on error. + */ +int bpf_remove_dentry_xattr_locked(struct dentry *dentry, const char *name__str) +{ + struct inode *inode = d_inode(dentry); + int ret; + + ret = bpf_xattr_write_permission(name__str, inode); + if (ret) + return ret; + + ret = __vfs_removexattr(&nop_mnt_idmap, dentry, name__str); + if (!ret) { + fsnotify_xattr(dentry); + + /* This xattr is removed by BPF LSM, so we do not call + * security_inode_post_removexattr. Otherwise, we would + * risk deadlocks by calling back to the same kfunc. + */ + } + return ret; +} + +__bpf_kfunc_start_defs(); + +/** + * bpf_set_dentry_xattr - set a xattr of a dentry + * @dentry: dentry to set the xattr on + * @name__str: name of the xattr + * @value_p: xattr value + * @flags: flags to pass into filesystem operations + * + * Set xattr *name__str* of *dentry* to the value in *value_p*. + * + * For security reasons, only *name__str* with prefix "security.bpf." + * is allowed. + * + * The caller has not locked dentry->d_inode. + * + * Return: 0 on success, a negative value on error. + */ +__bpf_kfunc int bpf_set_dentry_xattr(struct dentry *dentry, const char *name__str, + const struct bpf_dynptr *value_p, int flags) +{ + struct inode *inode = d_inode(dentry); + int ret; + + inode_lock(inode); + ret = bpf_set_dentry_xattr_locked(dentry, name__str, value_p, flags); + inode_unlock(inode); + return ret; +} + +/** + * bpf_remove_dentry_xattr - remove a xattr of a dentry + * @dentry: dentry to remove the xattr from + * @name__str: name of the xattr + * + * Remove xattr *name__str* of *dentry*. + * + * For security reasons, only *name__str* with prefix "security.bpf." + * is allowed. 
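As a consumer's-eye view: these kfuncs are registered further down with KF_SLEEPABLE | KF_TRUSTED_ARGS, so a sleepable BPF LSM program on a hook that passes a trusted dentry can call them. In the sketch below, only the bpf_set_dentry_xattr() signature and the "security.bpf." name rule come from this patch; the hook choice (inode_getxattr, which is not in the d_inode_locked_hooks set defined below, hence the unlocked variant), the xattr name, and the tag payload are illustrative assumptions.

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical BPF LSM program, not part of this patch. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* kfunc added by the patch above; resolved against kernel BTF */
extern int bpf_set_dentry_xattr(struct dentry *dentry, const char *name__str,
				const struct bpf_dynptr *value_p, int flags) __ksym;

char tag[] = "scanned";		/* illustrative payload */

SEC("lsm.s/inode_getxattr")	/* sleepable; the dentry arg is trusted */
int BPF_PROG(tag_on_getxattr, struct dentry *dentry, const char *name)
{
	struct bpf_dynptr value;

	/* wrap the payload so the kfunc can bounds-check it */
	bpf_dynptr_from_mem(tag, sizeof(tag), 0, &value);

	/* only "security.bpf.*" names pass bpf_xattr_write_permission() */
	bpf_set_dentry_xattr(dentry, "security.bpf.tag", &value, 0);
	return 0;
}

char _license[] SEC("license") = "GPL";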
+ * + * The caller has not locked dentry->d_inode. + * + * Return: 0 on success, a negative value on error. + */ +__bpf_kfunc int bpf_remove_dentry_xattr(struct dentry *dentry, const char *name__str) +{ + struct inode *inode = d_inode(dentry); + int ret; + + inode_lock(inode); + ret = bpf_remove_dentry_xattr_locked(dentry, name__str); + inode_unlock(inode); + return ret; +} + +__bpf_kfunc_end_defs(); + BTF_KFUNCS_START(bpf_fs_kfunc_set_ids) BTF_ID_FLAGS(func, bpf_get_task_exe_file, KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL) @@ -161,6 +331,8 @@ BTF_ID_FLAGS(func, bpf_put_file, KF_RELEASE) BTF_ID_FLAGS(func, bpf_path_d_path, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_get_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_set_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_remove_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_fs_kfunc_set_ids) static int bpf_fs_kfuncs_filter(const struct bpf_prog *prog, u32 kfunc_id) @@ -171,6 +343,37 @@ static int bpf_fs_kfuncs_filter(const struct bpf_prog *prog, u32 kfunc_id) return -EACCES; } +/* bpf_[set|remove]_dentry_xattr.* hooks have KF_TRUSTED_ARGS and + * KF_SLEEPABLE, so they are only available to sleepable hooks with + * dentry arguments. + * + * Setting and removing xattr requires exclusive lock on dentry->d_inode. + * Some hooks already locked d_inode, while some hooks have not locked + * d_inode. Therefore, we need different kfuncs for different hooks. + * Specifically, hooks in the following list (d_inode_locked_hooks) + * should call bpf_[set|remove]_dentry_xattr_locked; while other hooks + * should call bpf_[set|remove]_dentry_xattr. + */ +BTF_SET_START(d_inode_locked_hooks) +BTF_ID(func, bpf_lsm_inode_post_removexattr) +BTF_ID(func, bpf_lsm_inode_post_setattr) +BTF_ID(func, bpf_lsm_inode_post_setxattr) +BTF_ID(func, bpf_lsm_inode_removexattr) +BTF_ID(func, bpf_lsm_inode_rmdir) +BTF_ID(func, bpf_lsm_inode_setattr) +BTF_ID(func, bpf_lsm_inode_setxattr) +BTF_ID(func, bpf_lsm_inode_unlink) +#ifdef CONFIG_SECURITY_PATH +BTF_ID(func, bpf_lsm_path_unlink) +BTF_ID(func, bpf_lsm_path_rmdir) +#endif /* CONFIG_SECURITY_PATH */ +BTF_SET_END(d_inode_locked_hooks) + +bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog) +{ + return btf_id_set_contains(&d_inode_locked_hooks, prog->aux->attach_btf_id); +} + static const struct btf_kfunc_id_set bpf_fs_kfunc_set = { .owner = THIS_MODULE, .set = &bpf_fs_kfunc_set_ids, diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 7a7e0ef69973..15ea6348800b 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -12,6 +12,7 @@ #include <linux/string.h> #include <linux/mm.h> #include <uapi/linux/btrfs_tree.h> +#include "extent_io.h" struct extent_buffer; diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h index 48b9ddae4a46..0458cd51ed48 100644 --- a/fs/btrfs/acl.h +++ b/fs/btrfs/acl.h @@ -3,6 +3,8 @@ #ifndef BTRFS_ACL_H #define BTRFS_ACL_H +#include <linux/types.h> + struct posix_acl; struct inode; struct btrfs_trans_handle; diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index a4c51600a408..f3bffe08b290 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -168,7 +168,7 @@ static inline void thresh_exec_hook(struct btrfs_workqueue *wq) { int new_current_active; long pending; - int need_change = 0; + bool need_change = false; if (wq->thresh == NO_THRESHOLD) return; @@ -196,15 +196,14 @@ static inline void thresh_exec_hook(struct btrfs_workqueue 
*wq) new_current_active--; new_current_active = clamp_val(new_current_active, 1, wq->limit_active); if (new_current_active != wq->current_active) { - need_change = 1; + need_change = true; wq->current_active = new_current_active; } out: spin_unlock(&wq->thres_lock); - if (need_change) { + if (need_change) workqueue_set_max_active(wq->normal_wq, wq->current_active); - } } static void run_ordered_work(struct btrfs_workqueue *wq, @@ -296,7 +295,7 @@ static void btrfs_work_helper(struct work_struct *normal_work) struct btrfs_work *work = container_of(normal_work, struct btrfs_work, normal_work); struct btrfs_workqueue *wq = work->wq; - int need_order = 0; + bool need_order = false; /* * We should not touch things inside work in the following cases: @@ -307,7 +306,7 @@ static void btrfs_work_helper(struct work_struct *normal_work) * So we save the needed things here. */ if (work->ordered_func) - need_order = 1; + need_order = true; trace_btrfs_work_sched(work); thresh_exec_hook(wq); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 3d3923cfc357..5936cff80ff3 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1399,11 +1399,11 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx, ASSERT(ctx->roots == NULL); key.objectid = ctx->bytenr; - key.offset = (u64)-1; if (btrfs_fs_incompat(ctx->fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; path = btrfs_alloc_path(); if (!path) @@ -2206,11 +2206,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, struct btrfs_extent_item *ei; struct btrfs_key key; + key.objectid = logical; if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; - key.objectid = logical; key.offset = (u64)-1; ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index bc2555c44a12..8c2eee1f1878 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -97,33 +97,17 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, return bbio; } -/* Free a bio that was never submitted to the underlying device. */ -static void btrfs_cleanup_bio(struct btrfs_bio *bbio) -{ - if (bbio_has_ordered_extent(bbio)) - btrfs_put_ordered_extent(bbio->ordered); - bio_put(&bbio->bio); -} - -static void __btrfs_bio_end_io(struct btrfs_bio *bbio) -{ - if (bbio_has_ordered_extent(bbio)) { - struct btrfs_ordered_extent *ordered = bbio->ordered; - - bbio->end_io(bbio); - btrfs_put_ordered_extent(ordered); - } else { - bbio->end_io(bbio); - } -} - void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { bbio->bio.bi_status = status; if (bbio->bio.bi_pool == &btrfs_clone_bioset) { struct btrfs_bio *orig_bbio = bbio->private; - btrfs_cleanup_bio(bbio); + /* Free bio that was never submitted to the underlying device. */ + if (bbio_has_ordered_extent(bbio)) + btrfs_put_ordered_extent(bbio->ordered); + bio_put(&bbio->bio); + bbio = orig_bbio; } @@ -138,7 +122,15 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) /* Load split bio's error which might be set above. 
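The btrfs_bio_end_io() rework above folds the old __btrfs_bio_end_io() helper into its caller but preserves the subtle part: bbio->ordered is copied to a local before ->end_io() runs, because the callback may free the bbio. The same lifetime rule, reduced to a standalone toy with invented names (a sketch, not btrfs code):

#include <stdio.h>
#include <stdlib.h>

struct resource { int refs; };

static void resource_put(struct resource *res)
{
	if (--res->refs == 0)
		free(res);
}

struct request {
	void (*done)(struct request *);	/* may free the request */
	struct resource *res;		/* reference owned by the request */
};

static void request_complete(struct request *rq)
{
	struct resource *res = rq->res;	/* copy out before the callback */

	rq->done(rq);		/* the callback may free rq... */
	resource_put(res);	/* ...so drop the ref via the local copy */
}

static void free_request(struct request *rq) { free(rq); }

int main(void)
{
	struct resource *res = calloc(1, sizeof(*res));
	struct request *rq = calloc(1, sizeof(*rq));

	res->refs = 1;
	rq->done = free_request;
	rq->res = res;
	request_complete(rq);	/* no use-after-free: res is read via the copy */
	return 0;
}

Reading rq->res after rq->done(rq) would be a use-after-free whenever the callback frees the request, which is exactly the trap the inlined btrfs code avoids.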
*/ if (status == BLK_STS_OK) bbio->bio.bi_status = READ_ONCE(bbio->status); - __btrfs_bio_end_io(bbio); + + if (bbio_has_ordered_extent(bbio)) { + struct btrfs_ordered_extent *ordered = bbio->ordered; + + bbio->end_io(bbio); + btrfs_put_ordered_extent(ordered); + } else { + bbio->end_io(bbio); + } } } @@ -581,7 +573,7 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free) /* If an error occurred we just want to clean up the bio and move on. */ if (bio->bi_status) { - btrfs_bio_end_io(async->bbio, async->bbio->bio.bi_status); + btrfs_bio_end_io(async->bbio, bio->bi_status); return; } diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index c0a8f7d92acc..a8129f1ce78c 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -191,21 +191,21 @@ static int btrfs_bg_start_cmp(const struct rb_node *new, /* * This adds the block group to the fs_info rb tree for the block group cache */ -static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, - struct btrfs_block_group *block_group) +static int btrfs_add_block_group_cache(struct btrfs_block_group *block_group) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct rb_node *exist; int ret = 0; ASSERT(block_group->length != 0); - write_lock(&info->block_group_cache_lock); + write_lock(&fs_info->block_group_cache_lock); exist = rb_find_add_cached(&block_group->cache_node, - &info->block_group_cache_tree, btrfs_bg_start_cmp); + &fs_info->block_group_cache_tree, btrfs_bg_start_cmp); if (exist) ret = -EEXIST; - write_unlock(&info->block_group_cache_lock); + write_unlock(&fs_info->block_group_cache_lock); return ret; } @@ -584,7 +584,7 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ struct btrfs_root *extent_root; u64 search_offset; u64 search_end = block_group->start + block_group->length; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key search_key; int ret = 0; @@ -626,7 +626,6 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ lockdep_assert_held(&caching_ctl->mutex); lockdep_assert_held_read(&fs_info->commit_root_sem); - btrfs_free_path(path); return ret; } @@ -738,8 +737,8 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) path->reada = READA_FORWARD; key.objectid = last; - key.offset = 0; key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = 0; next: ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); @@ -785,8 +784,8 @@ next: if (key.objectid < last) { key.objectid = last; - key.offset = 0; key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = 0; btrfs_release_path(path); goto next; } @@ -1457,6 +1456,32 @@ out: } /* + * Link the block_group to a list via bg_list. + * + * @bg: The block_group to link to the list. + * @list: The list to link it to. + * + * Use this rather than list_add_tail() directly to ensure proper respect + * to locking and refcounting. + * + * Returns: true if the bg was linked with a refcount bump and false otherwise. + */ +static bool btrfs_link_bg_list(struct btrfs_block_group *bg, struct list_head *list) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + bool added = false; + + spin_lock(&fs_info->unused_bgs_lock); + if (list_empty(&bg->bg_list)) { + btrfs_get_block_group(bg); + list_add_tail(&bg->bg_list, list); + added = true; + } + spin_unlock(&fs_info->unused_bgs_lock); + return added; +} + +/* * Process the unused_bgs list and remove any that don't have any allocated * space inside of them. 
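btrfs_link_bg_list(), added just above, is an instance of a common kernel idiom: being on the list owns one reference, and a list_empty() check under the lock that guards the list makes linking idempotent. A generic sketch of the idiom, with invented names rather than btrfs API:

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/kref.h>

struct tracked {
	struct kref ref;
	struct list_head node;	/* empty (self-linked) when unlisted */
};

static bool link_tracked_once(struct tracked *t, struct list_head *list,
			      spinlock_t *lock)
{
	bool added = false;

	spin_lock(lock);
	if (list_empty(&t->node)) {	/* not on any list yet */
		kref_get(&t->ref);	/* the list now holds a reference */
		list_add_tail(&t->node, list);
		added = true;
	}
	spin_unlock(lock);
	return added;
}

The counterpart is unlinking with list_del_init() and then dropping the reference, which keeps list_empty() truthful for the next link attempt; note how the hunks below replace open-coded get/list_add_tail pairs with the helper.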
*/ @@ -1571,8 +1596,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * drop under the "next" label for the * fs_info->unused_bgs list. */ - btrfs_get_block_group(block_group); - list_add_tail(&block_group->bg_list, &retry_list); + btrfs_link_bg_list(block_group, &retry_list); trace_btrfs_skip_unused_block_group(block_group); spin_unlock(&block_group->lock); @@ -1823,7 +1847,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp); while (!list_empty(&fs_info->reclaim_bgs)) { u64 zone_unusable; - u64 reclaimed; + u64 used; + u64 reserved; int ret = 0; bg = list_first_entry(&fs_info->reclaim_bgs, @@ -1887,6 +1912,17 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) up_write(&space_info->groups_sem); goto next; } + + /* + * Cache the zone_unusable value before turning the block group + * to read only. As soon as the block group is read only its + * zone_unusable value gets moved to the block group's read-only + * bytes and isn't available for calculations anymore. We also + * cache it before unlocking the block group, to prevent races + * (reports from KCSAN and such tools) with tasks updating it. + */ + zone_unusable = bg->zone_unusable; + spin_unlock(&bg->lock); spin_unlock(&space_info->lock); @@ -1903,31 +1939,47 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) goto next; } - /* - * Cache the zone_unusable value before turning the block group - * to read only. As soon as the blog group is read only it's - * zone_unusable value gets moved to the block group's read-only - * bytes and isn't available for calculations anymore. - */ - zone_unusable = bg->zone_unusable; ret = inc_block_group_ro(bg, 0); up_write(&space_info->groups_sem); if (ret < 0) goto next; + /* + * The amount of bytes reclaimed corresponds to the sum of the + * "used" and "reserved" counters. We have set the block group + * to RO above, which prevents reservations from happening but + * we may have existing reservations for which allocation has + * not yet been done - btrfs_update_block_group() was not yet + * called, which is where we will transfer a reserved extent's + * size from the "reserved" counter to the "used" counter - this + * happens when running delayed references. When we relocate the + * chunk below, relocation first flushes delalloc, waits for + * ordered extent completion (which is where we create delayed + * references for data extents) and commits the current + * transaction (which runs delayed references), and only after + * it does the actual work to move extents out of the block + * group. So the reported amount of reclaimed bytes is
+ */ + spin_lock(&bg->lock); + used = bg->used; + reserved = bg->reserved; + spin_unlock(&bg->lock); + btrfs_info(fs_info, - "reclaiming chunk %llu with %llu%% used %llu%% unusable", + "reclaiming chunk %llu with %llu%% used %llu%% reserved %llu%% unusable", bg->start, - div64_u64(bg->used * 100, bg->length), + div64_u64(used * 100, bg->length), + div64_u64(reserved * 100, bg->length), div64_u64(zone_unusable * 100, bg->length)); trace_btrfs_reclaim_block_group(bg); - reclaimed = bg->used; ret = btrfs_relocate_chunk(fs_info, bg->start); if (ret) { btrfs_dec_block_group_ro(bg); btrfs_err(fs_info, "error relocating chunk %llu", bg->start); - reclaimed = 0; + used = 0; + reserved = 0; spin_lock(&space_info->lock); space_info->reclaim_errors++; if (READ_ONCE(space_info->periodic_reclaim)) @@ -1936,24 +1988,13 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) } spin_lock(&space_info->lock); space_info->reclaim_count++; - space_info->reclaim_bytes += reclaimed; + space_info->reclaim_bytes += used; + space_info->reclaim_bytes += reserved; spin_unlock(&space_info->lock); next: - if (ret && !READ_ONCE(space_info->periodic_reclaim)) { - /* Refcount held by the reclaim_bgs list after splice. */ - spin_lock(&fs_info->unused_bgs_lock); - /* - * This block group might be added to the unused list - * during the above process. Move it back to the - * reclaim list otherwise. - */ - if (list_empty(&bg->bg_list)) { - btrfs_get_block_group(bg); - list_add_tail(&bg->bg_list, &retry_list); - } - spin_unlock(&fs_info->unused_bgs_lock); - } + if (ret && !READ_ONCE(space_info->periodic_reclaim)) + btrfs_link_bg_list(bg, &retry_list); btrfs_put_block_group(bg); mutex_unlock(&fs_info->reclaim_bgs_lock); @@ -1993,13 +2034,8 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = bg->fs_info; - spin_lock(&fs_info->unused_bgs_lock); - if (list_empty(&bg->bg_list)) { - btrfs_get_block_group(bg); + if (btrfs_link_bg_list(bg, &fs_info->reclaim_bgs)) trace_btrfs_add_reclaim_block_group(bg); - list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs); - } - spin_unlock(&fs_info->unused_bgs_lock); } static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key *key, @@ -2410,7 +2446,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, goto error; } - ret = btrfs_add_block_group_cache(info, cache); + ret = btrfs_add_block_group_cache(cache); if (ret) { btrfs_remove_free_space_cache(cache); goto error; @@ -2459,7 +2495,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) bg->cached = BTRFS_CACHE_FINISHED; bg->used = map->chunk_len; bg->flags = map->type; - ret = btrfs_add_block_group_cache(fs_info, bg); + ret = btrfs_add_block_group_cache(bg); /* * We may have some valid block group cache added already, in * that case we skip to the next one. 
@@ -2509,8 +2545,8 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) return fill_dummy_bgs(info); key.objectid = 0; - key.offset = 0; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + key.offset = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2641,7 +2677,7 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dev_extent *extent; struct extent_buffer *leaf; struct btrfs_key key; @@ -2658,7 +2694,7 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans, key.offset = start; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent)); if (ret) - goto out; + return ret; leaf = path->nodes[0]; extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); @@ -2666,10 +2702,8 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans, btrfs_set_dev_extent_chunk_objectid(leaf, extent, BTRFS_FIRST_CHUNK_TREE_OBJECTID); btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); - btrfs_set_dev_extent_length(leaf, extent, num_bytes); -out: - btrfs_free_path(path); + return ret; } @@ -2771,8 +2805,12 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) /* Already aborted the transaction if it failed. */ next: btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); + + spin_lock(&fs_info->unused_bgs_lock); list_del_init(&block_group->bg_list); clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags); + btrfs_put_block_group(block_group); + spin_unlock(&fs_info->unused_bgs_lock); /* * If the block group is still unused, add it to the list of @@ -2888,7 +2926,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran cache->space_info = btrfs_find_space_info(fs_info, cache->flags); ASSERT(cache->space_info); - ret = btrfs_add_block_group_cache(fs_info, cache); + ret = btrfs_add_block_group_cache(cache); if (ret) { btrfs_remove_free_space_cache(cache); btrfs_put_block_group(cache); @@ -2910,7 +2948,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran } #endif - list_add_tail(&cache->bg_list, &trans->new_bgs); + btrfs_link_bg_list(cache, &trans->new_bgs); btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info); set_avail_alloc_bits(fs_info, type); @@ -3306,7 +3344,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *cache, *tmp; struct btrfs_transaction *cur_trans = trans->transaction; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); if (list_empty(&cur_trans->dirty_bgs) || !btrfs_test_opt(fs_info, SPACE_CACHE)) @@ -3323,7 +3361,6 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) cache_save_setup(cache, trans, path); } - btrfs_free_path(path); return 0; } @@ -3346,7 +3383,7 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) struct btrfs_transaction *cur_trans = trans->transaction; int ret = 0; int should_put; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); LIST_HEAD(dirty); struct list_head *io = &cur_trans->io_bgs; int loops = 0; @@ -3501,7 +3538,6 @@ out: btrfs_cleanup_dirty_bgs(cur_trans, fs_info); } - btrfs_free_path(path); return ret; } @@ -3512,7 +3548,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) struct btrfs_transaction *cur_trans = trans->transaction; int ret = 0; int should_put; - struct btrfs_path *path; + 
BTRFS_PATH_AUTO_FREE(path); struct list_head *io = &cur_trans->io_bgs; path = btrfs_alloc_path(); @@ -3624,7 +3660,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) btrfs_put_block_group(cache); } - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b2fa33911c28..4e2952cf5766 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -145,6 +145,7 @@ struct btrfs_inode { * different from prop_compress and takes precedence if set. */ u8 defrag_compress; + s8 defrag_compress_level; /* * Lock for counters and all fields used to determine if the inode is in @@ -516,6 +517,14 @@ static inline void btrfs_assert_inode_locked(struct btrfs_inode *inode) lockdep_assert_held(&inode->vfs_inode.i_rwsem); } +static inline void btrfs_update_inode_mapping_flags(struct btrfs_inode *inode) +{ + if (inode->flags & BTRFS_INODE_NODATASUM) + mapping_clear_stable_writes(inode->vfs_inode.i_mapping); + else + mapping_set_stable_writes(inode->vfs_inode.i_mapping); +} + /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes @@ -524,7 +533,7 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected); bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u32 bio_offset, struct bio_vec *bv); -noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, +noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, bool nowait); @@ -584,9 +593,9 @@ void btrfs_free_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); int __init btrfs_init_cachep(void); void __cold btrfs_destroy_cachep(void); -struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, - struct btrfs_path *path); -struct inode *btrfs_iget(u64 ino, struct btrfs_root *root); +struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, + struct btrfs_path *path); +struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct folio *folio, u64 start, u64 len); int btrfs_update_inode(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 0c4d486c3048..e7f8ee5d48a4 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -740,7 +740,7 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zstd_compress, }; -static struct list_head *alloc_workspace(int type, unsigned int level) +static struct list_head *alloc_workspace(int type, int level) { switch (type) { case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(); @@ -818,7 +818,7 @@ static void btrfs_cleanup_workspace_manager(int type) * Preallocation makes a forward progress guarantees and we do not return * errors. 
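
A note on the btrfs_update_inode_mapping_flags() helper added to btrfs_inode.h above: when an inode carries data checksums, its pages must not change while writeback is in flight, or the checksum computed at submit time will not match what reaches the media, so mapping_set_stable_writes() makes writers wait for writeback to complete; NODATASUM inodes can skip that cost. A sketch of the intended pattern, with a hypothetical call site (the real callers would be wherever inode->flags can change, e.g. inode read-in and flag-updating ioctls):

    /* Hypothetical call site, for illustration only. */
    static void btrfs_inode_flags_changed(struct btrfs_inode *inode, u32 new_flags)
    {
        inode->flags = new_flags;
        /* Keep AS_STABLE_WRITES in sync with BTRFS_INODE_NODATASUM. */
        btrfs_update_inode_mapping_flags(inode);
    }
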
*/ -struct list_head *btrfs_get_workspace(int type, unsigned int level) +struct list_head *btrfs_get_workspace(int type, int level) { struct workspace_manager *wsm; struct list_head *workspace; @@ -968,18 +968,28 @@ static void put_workspace(int type, struct list_head *ws) * Adjust @level according to the limits of the compression algorithm or * fallback to default */ -static unsigned int btrfs_compress_set_level(int type, unsigned level) +static int btrfs_compress_set_level(unsigned int type, int level) { const struct btrfs_compress_op *ops = btrfs_compress_op[type]; if (level == 0) level = ops->default_level; else - level = min(level, ops->max_level); + level = min(max(level, ops->min_level), ops->max_level); return level; } +/* + * Check whether the @level is within the valid range for the given type. + */ +bool btrfs_compress_level_valid(unsigned int type, int level) +{ + const struct btrfs_compress_op *ops = btrfs_compress_op[type]; + + return ops->min_level <= level && level <= ops->max_level; +} + /* Wrapper around find_get_page(), with extra error message. */ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, struct folio **in_folio_ret) @@ -1023,12 +1033,10 @@ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, * @total_out is an in/out parameter, must be set to the input length and will * be also used to return the total number of compressed bytes */ -int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping, +int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { - int type = btrfs_compress_type(type_level); - int level = btrfs_compress_level(type_level); const unsigned long orig_len = *total_out; struct list_head *workspace; int ret; @@ -1590,18 +1598,19 @@ out: /* * Convert the compression suffix (eg. after "zlib" starting with ":") to - * level, unrecognized string will set the default level + * level, unrecognized string will set the default level. Negative level + * numbers are allowed. */ -unsigned int btrfs_compress_str2level(unsigned int type, const char *str) +int btrfs_compress_str2level(unsigned int type, const char *str) { - unsigned int level = 0; + int level = 0; int ret; if (!type) return 0; if (str[0] == ':') { - ret = kstrtouint(str + 1, 10, &level); + ret = kstrtoint(str + 1, 10, &level); if (ret) level = 0; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 954034086d0d..df198623cc08 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -72,16 +72,6 @@ struct compressed_bio { struct btrfs_bio bbio; }; -static inline unsigned int btrfs_compress_type(unsigned int type_level) -{ - return (type_level & 0xF); -} - -static inline unsigned int btrfs_compress_level(unsigned int type_level) -{ - return ((type_level & 0xF0) >> 4); -} - /* @range_end must be exclusive. 
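
Since levels are now signed, btrfs_compress_set_level() above clamps at both ends of the per-algorithm range instead of only capping the top. A standalone illustration of the clamping behavior; the bounds here are made up, the real min/max/default come from each btrfs_compress_op and are not shown in this excerpt:

    #include <stdio.h>

    /* Stand-in for struct btrfs_compress_op; bounds are made up. */
    struct ops { int min_level, max_level, default_level; };

    static int set_level(const struct ops *ops, int level)
    {
        if (level == 0)
            return ops->default_level;  /* 0 still means "use the default" */
        if (level < ops->min_level)
            return ops->min_level;      /* new: clamp the bottom end too */
        if (level > ops->max_level)
            return ops->max_level;
        return level;
    }

    int main(void)
    {
        const struct ops zstd = { .min_level = -7, .max_level = 15, .default_level = 3 };

        printf("%d %d %d\n", set_level(&zstd, -9),   /* -> -7 */
                             set_level(&zstd, 20),   /* -> 15 */
                             set_level(&zstd, 0));   /* ->  3 */
        return 0;
    }
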
*/ static inline u32 btrfs_calc_input_length(u64 range_end, u64 cur) { @@ -93,7 +83,8 @@ static inline u32 btrfs_calc_input_length(u64 range_end, u64 cur) int __init btrfs_init_compress(void); void __cold btrfs_exit_compress(void); -int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping, +bool btrfs_compress_level_valid(unsigned int type, int level); +int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, @@ -107,7 +98,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, bool writeback); void btrfs_submit_compressed_read(struct btrfs_bio *bbio); -unsigned int btrfs_compress_str2level(unsigned int type, const char *str); +int btrfs_compress_str2level(unsigned int type, const char *str); struct folio *btrfs_alloc_compr_folio(void); void btrfs_free_compr_folio(struct folio *folio); @@ -131,14 +122,15 @@ struct workspace_manager { wait_queue_head_t ws_wait; }; -struct list_head *btrfs_get_workspace(int type, unsigned int level); +struct list_head *btrfs_get_workspace(int type, int level); void btrfs_put_workspace(int type, struct list_head *ws); struct btrfs_compress_op { struct workspace_manager *workspace_manager; /* Maximum level supported by the compression algorithm */ - unsigned int max_level; - unsigned int default_level; + int min_level; + int max_level; + int default_level; }; /* The heuristic workspaces are managed via the 0th workspace manager */ @@ -187,9 +179,9 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in, size_t destlen); void zstd_init_workspace_manager(void); void zstd_cleanup_workspace_manager(void); -struct list_head *zstd_alloc_workspace(unsigned int level); +struct list_head *zstd_alloc_workspace(int level); void zstd_free_workspace(struct list_head *ws); -struct list_head *zstd_get_workspace(unsigned int level); +struct list_head *zstd_get_workspace(int level); void zstd_put_workspace(struct list_head *ws); #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 3dc5a35dd19b..a2e7979372cc 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4306,7 +4306,7 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 data_size) { int ret = 0; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; unsigned long ptr; @@ -4320,7 +4320,6 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, write_extent_buffer(leaf, data, ptr, data_size); btrfs_mark_buffer_dirty(trans, leaf); } - btrfs_free_path(path); return ret; } @@ -4608,7 +4607,6 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, u64 min_trans) { struct extent_buffer *cur; - struct btrfs_key found_key; int slot; int sret; u32 nritems; @@ -4644,7 +4642,8 @@ again: goto find_next_key; ret = 0; path->slots[level] = slot; - btrfs_item_key_to_cpu(cur, &found_key, slot); + /* Save our key for returning back. 
*/ + btrfs_item_key_to_cpu(cur, min_key, slot); goto out; } if (sret && slot > 0) @@ -4668,8 +4667,8 @@ find_next_key: * we didn't find a candidate key in this node, walk forward * and find another one */ + path->slots[level] = slot; if (slot >= nritems) { - path->slots[level] = slot; sret = btrfs_find_next_key(root, path, min_key, level, min_trans); if (sret == 0) { @@ -4679,11 +4678,10 @@ find_next_key: goto out; } } - /* save our key for returning back */ - btrfs_node_key_to_cpu(cur, &found_key, slot); - path->slots[level] = slot; if (level == path->lowest_level) { ret = 0; + /* Save our key for returning back. */ + btrfs_node_key_to_cpu(cur, min_key, slot); goto out; } cur = btrfs_read_node_slot(cur, slot); @@ -4700,10 +4698,8 @@ find_next_key: } out: path->keep_locks = keep_locks; - if (ret == 0) { + if (ret == 0) btrfs_unlock_up_safe(path, path->lowest_level + 1); - memcpy(min_key, &found_key, sizeof(found_key)); - } return ret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1096a80a64e7..075a06db43a1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -6,7 +6,7 @@ #ifndef BTRFS_CTREE_H #define BTRFS_CTREE_H -#include "linux/cleanup.h" +#include <linux/cleanup.h> #include <linux/spinlock.h> #include <linux/rbtree.h> #include <linux/mutex.h> diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 968dae953948..d4310d93f532 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -225,7 +225,7 @@ static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, struct file_ra_state *ra) { struct btrfs_root *inode_root; - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_ioctl_defrag_range_args range; int ret = 0; u64 cur = 0; @@ -250,24 +250,24 @@ again: goto cleanup; } - if (cur >= i_size_read(inode)) { - iput(inode); + if (cur >= i_size_read(&inode->vfs_inode)) { + iput(&inode->vfs_inode); goto cleanup; } /* Do a chunk of defrag */ - clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); + clear_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags); memset(&range, 0, sizeof(range)); range.len = (u64)-1; range.start = cur; range.extent_thresh = defrag->extent_thresh; - file_ra_state_init(ra, inode->i_mapping); + file_ra_state_init(ra, inode->vfs_inode.i_mapping); sb_start_write(fs_info->sb); ret = btrfs_defrag_file(inode, ra, &range, defrag->transid, - BTRFS_DEFRAG_BATCH); + BTRFS_DEFRAG_BATCH); sb_end_write(fs_info->sb); - iput(inode); + iput(&inode->vfs_inode); if (ret < 0) goto cleanup; @@ -1352,17 +1352,18 @@ out: * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without * defragging all the range). 
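
The btrfs_search_forward() rework above drops the found_key temporary and writes straight into min_key at the two return points, which makes the in/out contract explicit: the caller passes the key to start from and gets back the key that was found. A sketch of the (unchanged) calling convention, where process_item() is a hypothetical consumer:

    struct btrfs_key key = { .objectid = 0, .type = 0, .offset = 0 };
    int ret;

    while ((ret = btrfs_search_forward(root, &key, path, min_trans)) == 0) {
        process_item(root, path, &key);     /* hypothetical consumer */
        btrfs_release_path(path);
        if (key.offset == (u64)-1)          /* avoid wrapping the offset */
            break;
        key.offset++;                       /* resume past the returned item */
    }
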
*/ -int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, +int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long sectors_defragged = 0; - u64 isize = i_size_read(inode); + u64 isize = i_size_read(&inode->vfs_inode); u64 cur; u64 last_byte; bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS); int compress_type = BTRFS_COMPRESS_ZLIB; + int compress_level = 0; int ret = 0; u32 extent_thresh = range->extent_thresh; pgoff_t start_index; @@ -1376,10 +1377,21 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, return -EINVAL; if (do_compress) { - if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES) - return -EINVAL; - if (range->compress_type) - compress_type = range->compress_type; + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL) { + if (range->compress.type >= BTRFS_NR_COMPRESS_TYPES) + return -EINVAL; + if (range->compress.type) { + compress_type = range->compress.type; + compress_level = range->compress.level; + if (!btrfs_compress_level_valid(compress_type, compress_level)) + return -EINVAL; + } + } else { + if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES) + return -EINVAL; + if (range->compress_type) + compress_type = range->compress_type; + } } if (extent_thresh == 0) @@ -1402,8 +1414,8 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, * defrag range can be written sequentially. */ start_index = cur >> PAGE_SHIFT; - if (start_index < inode->i_mapping->writeback_index) - inode->i_mapping->writeback_index = start_index; + if (start_index < inode->vfs_inode.i_mapping->writeback_index) + inode->vfs_inode.i_mapping->writeback_index = start_index; while (cur < last_byte) { const unsigned long prev_sectors_defragged = sectors_defragged; @@ -1420,27 +1432,29 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1; cluster_end = min(cluster_end, last_byte); - btrfs_inode_lock(BTRFS_I(inode), 0); - if (IS_SWAPFILE(inode)) { + btrfs_inode_lock(inode, 0); + if (IS_SWAPFILE(&inode->vfs_inode)) { ret = -ETXTBSY; - btrfs_inode_unlock(BTRFS_I(inode), 0); + btrfs_inode_unlock(inode, 0); break; } - if (!(inode->i_sb->s_flags & SB_ACTIVE)) { - btrfs_inode_unlock(BTRFS_I(inode), 0); + if (!(inode->vfs_inode.i_sb->s_flags & SB_ACTIVE)) { + btrfs_inode_unlock(inode, 0); break; } - if (do_compress) - BTRFS_I(inode)->defrag_compress = compress_type; - ret = defrag_one_cluster(BTRFS_I(inode), ra, cur, + if (do_compress) { + inode->defrag_compress = compress_type; + inode->defrag_compress_level = compress_level; + } + ret = defrag_one_cluster(inode, ra, cur, cluster_end + 1 - cur, extent_thresh, newer_than, do_compress, §ors_defragged, max_to_defrag, &last_scanned); if (sectors_defragged > prev_sectors_defragged) - balance_dirty_pages_ratelimited(inode->i_mapping); + balance_dirty_pages_ratelimited(inode->vfs_inode.i_mapping); - btrfs_inode_unlock(BTRFS_I(inode), 0); + btrfs_inode_unlock(inode, 0); if (ret < 0) break; cur = max(cluster_end + 1, last_scanned); @@ -1462,10 +1476,10 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, * need to be written back immediately. 
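
For context on the new defrag fields: btrfs_defrag_file() now honors a per-request compression level when BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL is set, stored in the new s8 defrag_compress_level so that negative zstd levels survive. How an ioctl caller might use it, assuming the uapi pairs the legacy compress_type with the compress type/level members the kernel code above dereferences; the userspace spellings of BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL and BTRFS_COMPRESS_ZSTD are assumptions here:

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/btrfs.h>

    static int defrag_zstd_fast(int fd)
    {
        struct btrfs_ioctl_defrag_range_args args;

        memset(&args, 0, sizeof(args));
        args.len = (__u64)-1;               /* whole file */
        args.flags = BTRFS_DEFRAG_RANGE_COMPRESS |
                     BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL;
        args.compress.type = BTRFS_COMPRESS_ZSTD;
        args.compress.level = -3;           /* negative zstd level: faster, lighter */

        return ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &args);
    }
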
*/ if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) { - filemap_flush(inode->i_mapping); + filemap_flush(inode->vfs_inode.i_mapping); if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_flush(inode->i_mapping); + &inode->runtime_flags)) + filemap_flush(inode->vfs_inode.i_mapping); } if (range->compress_type == BTRFS_COMPRESS_LZO) btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); @@ -1474,9 +1488,9 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, ret = sectors_defragged; } if (do_compress) { - btrfs_inode_lock(BTRFS_I(inode), 0); - BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; - btrfs_inode_unlock(BTRFS_I(inode), 0); + btrfs_inode_lock(inode, 0); + inode->defrag_compress = BTRFS_COMPRESS_NONE; + btrfs_inode_unlock(inode, 0); } return ret; } diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h index 6b7596c4f0dc..a7f917a38dbf 100644 --- a/fs/btrfs/defrag.h +++ b/fs/btrfs/defrag.h @@ -6,14 +6,14 @@ #include <linux/types.h> #include <linux/compiler_types.h> -struct inode; struct file_ra_state; +struct btrfs_inode; struct btrfs_fs_info; struct btrfs_root; struct btrfs_trans_handle; struct btrfs_ioctl_defrag_range_args; -int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, +int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag); int __init btrfs_auto_defrag_init(void); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0b4933c6a889..3f1551d8a5c6 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1211,7 +1211,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_block_rsv *block_rsv; int ret; @@ -1238,7 +1238,6 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node); btrfs_release_delayed_node(delayed_node); - btrfs_free_path(path); trans->block_rsv = block_rsv; return ret; @@ -1817,53 +1816,53 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, static void fill_stack_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *inode_item, - struct inode *inode) + struct btrfs_inode *inode) { + struct inode *vfs_inode = &inode->vfs_inode; u64 flags; - btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode)); - btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode)); - btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size); - btrfs_set_stack_inode_mode(inode_item, inode->i_mode); - btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink); - btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode)); - btrfs_set_stack_inode_generation(inode_item, - BTRFS_I(inode)->generation); + btrfs_set_stack_inode_uid(inode_item, i_uid_read(vfs_inode)); + btrfs_set_stack_inode_gid(inode_item, i_gid_read(vfs_inode)); + btrfs_set_stack_inode_size(inode_item, inode->disk_i_size); + btrfs_set_stack_inode_mode(inode_item, vfs_inode->i_mode); + btrfs_set_stack_inode_nlink(inode_item, vfs_inode->i_nlink); + btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(vfs_inode)); + btrfs_set_stack_inode_generation(inode_item, inode->generation); btrfs_set_stack_inode_sequence(inode_item, - inode_peek_iversion(inode)); + inode_peek_iversion(vfs_inode)); 
btrfs_set_stack_inode_transid(inode_item, trans->transid); - btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); - flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, - BTRFS_I(inode)->ro_flags); + btrfs_set_stack_inode_rdev(inode_item, vfs_inode->i_rdev); + flags = btrfs_inode_combine_flags(inode->flags, inode->ro_flags); btrfs_set_stack_inode_flags(inode_item, flags); btrfs_set_stack_inode_block_group(inode_item, 0); btrfs_set_stack_timespec_sec(&inode_item->atime, - inode_get_atime_sec(inode)); + inode_get_atime_sec(vfs_inode)); btrfs_set_stack_timespec_nsec(&inode_item->atime, - inode_get_atime_nsec(inode)); + inode_get_atime_nsec(vfs_inode)); btrfs_set_stack_timespec_sec(&inode_item->mtime, - inode_get_mtime_sec(inode)); + inode_get_mtime_sec(vfs_inode)); btrfs_set_stack_timespec_nsec(&inode_item->mtime, - inode_get_mtime_nsec(inode)); + inode_get_mtime_nsec(vfs_inode)); btrfs_set_stack_timespec_sec(&inode_item->ctime, - inode_get_ctime_sec(inode)); + inode_get_ctime_sec(vfs_inode)); btrfs_set_stack_timespec_nsec(&inode_item->ctime, - inode_get_ctime_nsec(inode)); + inode_get_ctime_nsec(vfs_inode)); - btrfs_set_stack_timespec_sec(&inode_item->otime, BTRFS_I(inode)->i_otime_sec); - btrfs_set_stack_timespec_nsec(&inode_item->otime, BTRFS_I(inode)->i_otime_nsec); + btrfs_set_stack_timespec_sec(&inode_item->otime, inode->i_otime_sec); + btrfs_set_stack_timespec_nsec(&inode_item->otime, inode->i_otime_nsec); } -int btrfs_fill_inode(struct inode *inode, u32 *rdev) +int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev) { - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_delayed_node *delayed_node; struct btrfs_inode_item *inode_item; + struct inode *vfs_inode = &inode->vfs_inode; - delayed_node = btrfs_get_delayed_node(BTRFS_I(inode)); + delayed_node = btrfs_get_delayed_node(inode); if (!delayed_node) return -ENOENT; @@ -1876,39 +1875,38 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) inode_item = &delayed_node->inode_item; - i_uid_write(inode, btrfs_stack_inode_uid(inode_item)); - i_gid_write(inode, btrfs_stack_inode_gid(inode_item)); - btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item)); - btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, - round_up(i_size_read(inode), fs_info->sectorsize)); - inode->i_mode = btrfs_stack_inode_mode(inode_item); - set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); - inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); - BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); - BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item); - - inode_set_iversion_queried(inode, - btrfs_stack_inode_sequence(inode_item)); - inode->i_rdev = 0; + i_uid_write(vfs_inode, btrfs_stack_inode_uid(inode_item)); + i_gid_write(vfs_inode, btrfs_stack_inode_gid(inode_item)); + btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); + btrfs_inode_set_file_extent_range(inode, 0, + round_up(i_size_read(vfs_inode), fs_info->sectorsize)); + vfs_inode->i_mode = btrfs_stack_inode_mode(inode_item); + set_nlink(vfs_inode, btrfs_stack_inode_nlink(inode_item)); + inode_set_bytes(vfs_inode, btrfs_stack_inode_nbytes(inode_item)); + inode->generation = btrfs_stack_inode_generation(inode_item); + inode->last_trans = btrfs_stack_inode_transid(inode_item); + + inode_set_iversion_queried(vfs_inode, btrfs_stack_inode_sequence(inode_item)); + vfs_inode->i_rdev = 0; *rdev = btrfs_stack_inode_rdev(inode_item); 
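
The long run of BTRFS_I()/vfs_inode conversions in this file (and in defrag.c and export.c) follows one mechanical rule worth stating once: struct btrfs_inode embeds the VFS inode, so the two convert both ways without casts. As defined in btrfs_inode.h:

    static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
    {
        return container_of(inode, struct btrfs_inode, vfs_inode);
    }

    /* The reverse direction is simply &inode->vfs_inode, as used throughout. */

Passing struct btrfs_inode directly saves the repeated BTRFS_I() lookups and makes the btrfs-specific fields reachable without a conversion at every use.
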
btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item), - &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); + &inode->flags, &inode->ro_flags); - inode_set_atime(inode, btrfs_stack_timespec_sec(&inode_item->atime), + inode_set_atime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->atime), btrfs_stack_timespec_nsec(&inode_item->atime)); - inode_set_mtime(inode, btrfs_stack_timespec_sec(&inode_item->mtime), + inode_set_mtime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->mtime), btrfs_stack_timespec_nsec(&inode_item->mtime)); - inode_set_ctime(inode, btrfs_stack_timespec_sec(&inode_item->ctime), + inode_set_ctime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->ctime), btrfs_stack_timespec_nsec(&inode_item->ctime)); - BTRFS_I(inode)->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime); - BTRFS_I(inode)->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime); + inode->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime); + inode->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime); - inode->i_generation = BTRFS_I(inode)->generation; - if (S_ISDIR(inode->i_mode)) - BTRFS_I(inode)->index_cnt = (u64)-1; + vfs_inode->i_generation = inode->generation; + if (S_ISDIR(vfs_inode->i_mode)) + inode->index_cnt = (u64)-1; mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_node(delayed_node); @@ -1928,8 +1926,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, mutex_lock(&delayed_node->mutex); if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { - fill_stack_inode_item(trans, &delayed_node->inode_item, - &inode->vfs_inode); + fill_stack_inode_item(trans, &delayed_node->inode_item, inode); goto release_node; } @@ -1937,7 +1934,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, if (ret) goto release_node; - fill_stack_inode_item(trans, &delayed_node->inode_item, &inode->vfs_inode); + fill_stack_inode_item(trans, &delayed_node->inode_item, inode); set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); delayed_node->count++; atomic_inc(&root->fs_info->delayed_root->items); diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index f4d9feac0d0e..c4b4ba122beb 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -133,7 +133,7 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode); int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); -int btrfs_fill_inode(struct inode *inode, u32 *rdev); +int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev); int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode); /* Used for drop dead root */ diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index a35067cebb97..f5ae880308d3 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -14,6 +14,8 @@ #include <linux/spinlock.h> #include <linux/slab.h> #include <uapi/linux/btrfs_tree.h> +#include "fs.h" +#include "messages.h" struct btrfs_trans_handle; struct btrfs_fs_info; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index f86fbea0b3de..53d7d85cb4be 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -76,7 +76,7 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) struct extent_buffer *eb; int slot; int ret = 0; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); int item_size; struct btrfs_dev_replace_item *ptr; u64 src_devid; @@ -85,10 +85,8 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) return 0; path = btrfs_alloc_path(); - if (!path) { - ret = 
-ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; key.objectid = 0; key.type = BTRFS_DEV_REPLACE_KEY; @@ -103,10 +101,8 @@ no_valid_dev_replace_entry_found: if (btrfs_find_device(fs_info->fs_devices, &args)) { btrfs_err(fs_info, "found replace target device without a valid replace item"); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } - ret = 0; dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; dev_replace->cont_reading_from_srcdev_mode = @@ -123,7 +119,7 @@ no_valid_dev_replace_entry_found: dev_replace->tgtdev = NULL; dev_replace->is_valid = 0; dev_replace->item_needs_writeback = 0; - goto out; + return 0; } slot = path->slots[0]; eb = path->nodes[0]; @@ -226,8 +222,6 @@ no_valid_dev_replace_entry_found: break; } -out: - btrfs_free_path(path); return ret; } @@ -346,7 +340,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; int ret; struct btrfs_root *dev_root = fs_info->dev_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *eb; struct btrfs_dev_replace_item *ptr; @@ -365,16 +359,15 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) key.offset = 0; path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { btrfs_warn(fs_info, "error %d while searching for dev_replace item!", ret); - goto out; + return ret; } if (ret == 0 && @@ -395,7 +388,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) btrfs_warn(fs_info, "delete too small dev_replace item failed %d!", ret); - goto out; + return ret; } ret = 1; } @@ -408,7 +401,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) if (ret < 0) { btrfs_warn(fs_info, "insert dev_replace item failed %d!", ret); - goto out; + return ret; } } @@ -440,8 +433,6 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) dev_replace->cursor_right); dev_replace->item_needs_writeback = 0; up_write(&dev_replace->rwsem); -out: - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index ccf91de29f80..b29cc31a7c4a 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -236,7 +236,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, int data_size; struct extent_buffer *leaf; int slot; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) @@ -251,20 +251,17 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, if (IS_ERR(di)) { ret = PTR_ERR(di); /* Nothing found, we're safe */ - if (ret == -ENOENT) { - ret = 0; - goto out; - } + if (ret == -ENOENT) + return 0; if (ret < 0) - goto out; + return ret; } /* we found an item, look for our name in the item */ if (di) { /* our exact name was found */ - ret = -EEXIST; - goto out; + return -EEXIST; } /* See if there is room in the item to insert this name. */ @@ -273,14 +270,11 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, slot = path->slots[0]; if (data_size + btrfs_item_size(leaf, slot) + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) { - ret = -EOVERFLOW; - } else { - /* plenty of insertion room */ - ret = 0; + return -EOVERFLOW; } -out: - btrfs_free_path(path); - return ret; + + /* Plenty of insertion room. 
*/ + return 0; } /* diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h index 28d69970bc70..8462579a95f4 100644 --- a/fs/btrfs/dir-item.h +++ b/fs/btrfs/dir-item.h @@ -10,6 +10,7 @@ struct fscrypt_str; struct btrfs_fs_info; struct btrfs_key; struct btrfs_path; +struct btrfs_inode; struct btrfs_root; struct btrfs_trans_handle; diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index 8567af46e16f..a374ce7a1813 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -248,7 +248,8 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, len = min(len, em->len - (start - em->start)); block_start = extent_map_block_start(em) + (start - em->start); - if (can_nocow_extent(inode, start, &len, &file_extent, false) == 1) { + if (can_nocow_extent(BTRFS_I(inode), start, &len, &file_extent, + false) == 1) { bg = btrfs_inc_nocow_writers(fs_info, block_start); if (bg) can_nocow = true; @@ -855,6 +856,22 @@ relock: btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); goto buffered; } + /* + * We can't control the folios being passed in, applications can write + * to them while a direct IO write is in progress. This means the + * content might change after we calculated the data checksum. + * Therefore we can end up storing a checksum that doesn't match the + * persisted data. + * + * To be extra safe and avoid false data checksum mismatch, if the + * inode requires data checksum, just fallback to buffered IO. + * For buffered IO we have full control of page cache and can ensure + * no one is modifying the content during writeback. + */ + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + goto buffered; + } /* * The iov_iter can be mapped to the same file range we are writing to. diff --git a/fs/btrfs/direct-io.h b/fs/btrfs/direct-io.h index 3dc3ea926afe..df5d45ee6de7 100644 --- a/fs/btrfs/direct-io.h +++ b/fs/btrfs/direct-io.h @@ -5,6 +5,8 @@ #include <linux/types.h> +struct kiocb; + int __init btrfs_init_dio(void); void __cold btrfs_destroy_dio(void); diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index e815d165cccc..d6eef4bd9e9d 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -167,13 +167,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, block_group->discard_eligible_time = 0; queued = !list_empty(&block_group->discard_list); list_del_init(&block_group->discard_list); - /* - * If the block group is currently running in the discard workfn, we - * don't want to deref it, since it's still being used by the workfn. - * The workfn will notice this case and deref the block group when it is - * finished. 
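
The discard.c changes around here replace a subtle ownership dance, where the workfn sometimes owned the reference depending on whether discard_ctl->block_group had been cleared behind its back, with an unconditional rule: peek_discard_list() takes a reference on whatever it publishes in discard_ctl->block_group, and every exit path of the workfn (discard work stopped, group not yet eligible, normal completion) drops that reference and clears the pointer under the lock. The repeated put-and-clear sequence could be captured by a helper; a hypothetical sketch of the pattern the diff open-codes in three places:

    /* Hypothetical helper; not part of the patch itself. */
    static void discard_drop_bg(struct btrfs_discard_ctl *discard_ctl,
                                struct btrfs_block_group *bg)
    {
        spin_lock(&discard_ctl->lock);
        btrfs_put_block_group(bg);
        discard_ctl->block_group = NULL;
        spin_unlock(&discard_ctl->lock);
    }
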
- */ - if (queued && !running) + if (queued) btrfs_put_block_group(block_group); spin_unlock(&discard_ctl->lock); @@ -260,9 +254,10 @@ again: block_group->discard_cursor = block_group->start; block_group->discard_state = BTRFS_DISCARD_EXTENTS; } - discard_ctl->block_group = block_group; } if (block_group) { + btrfs_get_block_group(block_group); + discard_ctl->block_group = block_group; *discard_state = block_group->discard_state; *discard_index = block_group->discard_index; } @@ -493,9 +488,20 @@ static void btrfs_discard_workfn(struct work_struct *work) block_group = peek_discard_list(discard_ctl, &discard_state, &discard_index, now); - if (!block_group || !btrfs_run_discard_work(discard_ctl)) + if (!block_group) return; + if (!btrfs_run_discard_work(discard_ctl)) { + spin_lock(&discard_ctl->lock); + btrfs_put_block_group(block_group); + discard_ctl->block_group = NULL; + spin_unlock(&discard_ctl->lock); + return; + } if (now < block_group->discard_eligible_time) { + spin_lock(&discard_ctl->lock); + btrfs_put_block_group(block_group); + discard_ctl->block_group = NULL; + spin_unlock(&discard_ctl->lock); btrfs_discard_schedule_work(discard_ctl, false); return; } @@ -547,15 +553,7 @@ static void btrfs_discard_workfn(struct work_struct *work) spin_lock(&discard_ctl->lock); discard_ctl->prev_discard = trimmed; discard_ctl->prev_discard_time = now; - /* - * If the block group was removed from the discard list while it was - * running in this workfn, then we didn't deref it, since this function - * still owned that reference. But we set the discard_ctl->block_group - * back to NULL, so we can use that condition to know that now we need - * to deref the block_group. - */ - if (discard_ctl->block_group == NULL) - btrfs_put_block_group(block_group); + btrfs_put_block_group(block_group); discard_ctl->block_group = NULL; __btrfs_discard_schedule_work(discard_ctl, now, false); spin_unlock(&discard_ctl->lock); diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h index dddb0f9101ba..2c5e85394092 100644 --- a/fs/btrfs/discard.h +++ b/fs/btrfs/discard.h @@ -3,6 +3,7 @@ #ifndef BTRFS_DISCARD_H #define BTRFS_DISCARD_H +#include <linux/types.h> #include <linux/sizes.h> struct btrfs_fs_info; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f09db62e61a1..1a916716cefe 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -182,13 +182,12 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) { struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios = num_extent_folios(eb); int ret = 0; if (sb_rdonly(fs_info->sb)) return -EROFS; - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; u64 start = max_t(u64, eb->start, folio_pos(folio)); u64 end = min_t(u64, eb->start + eb->len, @@ -284,8 +283,7 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) if (WARN_ON_ONCE(found_start != eb->start)) return BLK_STS_IOERR; - if (WARN_ON(!btrfs_folio_test_uptodate(fs_info, eb->folios[0], - eb->start, eb->len))) + if (WARN_ON(!btrfs_meta_folio_test_uptodate(eb->folios[0], eb))) return BLK_STS_IOERR; ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, @@ -1089,21 +1087,22 @@ struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, const struct btrfs_key *key) { struct btrfs_root *root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) return ERR_PTR(-ENOMEM); root = read_tree_root_path(tree_root, path, key); - 
btrfs_free_path(path); return root; } /* - * Initialize subvolume root in-memory structure + * Initialize subvolume root in-memory structure. * * @anon_dev: anonymous device to attach to the root, if zero, allocate new + * + * In case of failure the caller is responsible to call btrfs_free_fs_root() */ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) { @@ -1127,7 +1126,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) if (!anon_dev) { ret = get_anon_bdev(&root->anon_dev); if (ret) - goto fail; + return ret; } else { root->anon_dev = anon_dev; } @@ -1137,7 +1136,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) ret = btrfs_init_root_free_objectid(root); if (ret) { mutex_unlock(&root->objectid_mutex); - goto fail; + return ret; } ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); @@ -1145,9 +1144,6 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) mutex_unlock(&root->objectid_mutex); return 0; -fail: - /* The caller is responsible to call btrfs_free_fs_root */ - return ret; } static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, @@ -2200,8 +2196,8 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, static int load_global_roots(struct btrfs_root *tree_root) { - struct btrfs_path *path; - int ret = 0; + BTRFS_PATH_AUTO_FREE(path); + int ret; path = btrfs_alloc_path(); if (!path) @@ -2210,18 +2206,17 @@ static int load_global_roots(struct btrfs_root *tree_root) ret = load_global_roots_objectid(tree_root, path, BTRFS_EXTENT_TREE_OBJECTID, "extent"); if (ret) - goto out; + return ret; ret = load_global_roots_objectid(tree_root, path, BTRFS_CSUM_TREE_OBJECTID, "csum"); if (ret) - goto out; + return ret; if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE)) - goto out; + return ret; ret = load_global_roots_objectid(tree_root, path, BTRFS_FREE_SPACE_TREE_OBJECTID, "free space"); -out: - btrfs_free_path(path); + return ret; } @@ -2447,21 +2442,27 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, * Check sectorsize and nodesize first, other check will need it. * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here. */ - if (!is_power_of_2(sectorsize) || sectorsize < 4096 || + if (!is_power_of_2(sectorsize) || sectorsize < BTRFS_MIN_BLOCKSIZE || sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize); ret = -EINVAL; } /* - * We only support at most two sectorsizes: 4K and PAGE_SIZE. + * We only support at most 3 sectorsizes: 4K, PAGE_SIZE, MIN_BLOCKSIZE. + * + * For 4K page sized systems with non-debug builds, all 3 matches (4K). + * For 4K page sized systems with debug builds, there are two block sizes + * supported. (4K and 2K) * * We can support 16K sectorsize with 64K page size without problem, * but such sectorsize/pagesize combination doesn't make much sense. * 4K will be our future standard, PAGE_SIZE is supported from the very * beginning. 
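
Concretely, the relaxed sectorsize check a little further down admits the following combinations, assuming BTRFS_MIN_BLOCKSIZE is 2K on debug builds (per the comment above) and equal to 4K otherwise:

    /*
     *   4K pages, non-debug:  4K            (all three cases coincide)
     *   4K pages, debug:      2K, 4K
     *   64K pages:            4K, 64K       (16K is still rejected)
     *
     * Anything larger than PAGE_SIZE fails the first clause outright.
     */
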
*/ - if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) { + if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && + sectorsize != PAGE_SIZE && + sectorsize != BTRFS_MIN_BLOCKSIZE)) { btrfs_err(fs_info, "sectorsize %llu not yet supported for page size %lu", sectorsize, PAGE_SIZE); @@ -2561,6 +2562,9 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, ret = -EINVAL; } + if (ret) + return ret; + ret = validate_sys_chunk_array(fs_info, sb); /* @@ -3390,7 +3394,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); - fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits); fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; fs_info->fs_devices->fs_info = fs_info; @@ -3416,11 +3419,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize); - if (sectorsize < PAGE_SIZE) - btrfs_warn(fs_info, - "read-write for sector size %u with page size %lu is experimental", - sectorsize, PAGE_SIZE); - ret = btrfs_init_workqueues(fs_info); if (ret) goto fail_sb_buffer; @@ -4326,6 +4324,14 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_cleanup_defrag_inodes(fs_info); /* + * Handle the error fs first, as it will flush and wait for all ordered + * extents. This will generate delayed iputs, thus we want to handle + * it first. + */ + if (unlikely(BTRFS_FS_ERROR(fs_info))) + btrfs_error_commit_super(fs_info); + + /* * Wait for any fixup workers to complete. * If we don't wait for them here and they are still running by the time * we call kthread_stop() against the cleaner kthread further below, we @@ -4346,6 +4352,31 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_flush_workqueue(fs_info->delalloc_workers); /* + * We can have ordered extents getting their last reference dropped from + * the fs_info->workers queue because for async writes for data bios we + * queue a work for that queue, at btrfs_wq_submit_bio(), that runs + * run_one_async_done() which calls btrfs_bio_end_io() in case the bio + * has an error, and that later function can do the final + * btrfs_put_ordered_extent() on the ordered extent attached to the bio, + * which adds a delayed iput for the inode. So we must flush the queue + * so that we don't have delayed iputs after committing the current + * transaction below and stopping the cleaner and transaction kthreads. + */ + btrfs_flush_workqueue(fs_info->workers); + + /* + * When finishing a compressed write bio we schedule a work queue item + * to finish an ordered extent - btrfs_finish_compressed_write_work() + * calls btrfs_finish_ordered_extent() which in turns does a call to + * btrfs_queue_ordered_fn(), and that queues the ordered extent + * completion either in the endio_write_workers work queue or in the + * fs_info->endio_freespace_worker work queue. We flush those queues + * below, so before we flush them we must flush this queue for the + * workers of compressed writes. + */ + flush_workqueue(fs_info->compressed_write_workers); + + /* * After we parked the cleaner kthread, ordered extents may have * completed and created new delayed iputs. 
If one of the async reclaim * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we @@ -4369,6 +4400,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) /* Ordered extents for free space inodes. */ btrfs_flush_workqueue(fs_info->endio_freespace_worker); btrfs_run_delayed_iputs(fs_info); + /* There should be no more workload to generate new delayed iputs. */ + set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state); cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); @@ -4403,9 +4436,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "commit super ret %d", ret); } - if (BTRFS_FS_ERROR(fs_info)) - btrfs_error_commit_super(fs_info); - kthread_stop(fs_info->transaction_kthread); kthread_stop(fs_info->cleaner_kthread); @@ -4528,10 +4558,6 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) /* cleanup FS via transaction */ btrfs_cleanup_transaction(fs_info); - mutex_lock(&fs_info->cleaner_mutex); - btrfs_run_delayed_iputs(fs_info); - mutex_unlock(&fs_info->cleaner_mutex); - down_write(&fs_info->cleanup_work_sem); up_write(&fs_info->cleanup_work_sem); } @@ -4902,7 +4928,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) int btrfs_init_root_free_objectid(struct btrfs_root *root) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; struct extent_buffer *l; struct btrfs_key search_key; @@ -4918,14 +4944,13 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) search_key.offset = (u64)-1; ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) - goto error; + return ret; if (ret == 0) { /* * Key with offset -1 found, there would have to exist a root * with such id, but this is out of valid range. 
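
The new BTRFS_FS_STATE_NO_DELAYED_IPUT bit set above is a tripwire rather than a behavior change: after the last btrfs_run_delayed_iputs() during close_ctree(), nothing should queue another delayed iput. A plausible consumer of the flag (a sketch, not necessarily the exact check from this series):

    void btrfs_add_delayed_iput(struct btrfs_inode *inode)
    {
        struct btrfs_fs_info *fs_info = inode->root->fs_info;

        /* Nobody should queue a delayed iput this late in unmount. */
        WARN_ON_ONCE(test_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT,
                              &fs_info->fs_state));

        /* ... existing queueing logic unchanged ... */
    }
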
*/ - ret = -EUCLEAN; - goto error; + return -EUCLEAN; } if (path->slots[0] > 0) { slot = path->slots[0] - 1; @@ -4936,10 +4961,8 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) } else { root->free_objectid = BTRFS_FIRST_FREE_OBJECTID; } - ret = 0; -error: - btrfs_free_path(path); - return ret; + + return 0; } int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index e2b22bea348a..7fc8a3200b40 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -75,7 +75,7 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, { struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root; - struct inode *inode; + struct btrfs_inode *inode; if (objectid < BTRFS_FIRST_FREE_OBJECTID) return ERR_PTR(-ESTALE); @@ -89,12 +89,12 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, if (IS_ERR(inode)) return ERR_CAST(inode); - if (generation != 0 && generation != inode->i_generation) { - iput(inode); + if (generation != 0 && generation != inode->vfs_inode.i_generation) { + iput(&inode->vfs_inode); return ERR_PTR(-ESTALE); } - return d_obtain_alias(inode); + return d_obtain_alias(&inode->vfs_inode); } static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, @@ -145,9 +145,10 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, struct dentry *btrfs_get_parent(struct dentry *child) { - struct inode *dir = d_inode(child); - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_inode *dir = BTRFS_I(d_inode(child)); + struct btrfs_inode *inode; + struct btrfs_root *root = dir->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_root_ref *ref; @@ -159,13 +160,13 @@ struct dentry *btrfs_get_parent(struct dentry *child) if (!path) return ERR_PTR(-ENOMEM); - if (btrfs_ino(BTRFS_I(dir)) == BTRFS_FIRST_FREE_OBJECTID) { + if (btrfs_ino(dir) == BTRFS_FIRST_FREE_OBJECTID) { key.objectid = btrfs_root_id(root); key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = (u64)-1; root = fs_info->tree_root; } else { - key.objectid = btrfs_ino(BTRFS_I(dir)); + key.objectid = btrfs_ino(dir); key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; } @@ -210,7 +211,11 @@ struct dentry *btrfs_get_parent(struct dentry *child) found_key.offset, 0); } - return d_obtain_alias(btrfs_iget(key.objectid, root)); + inode = btrfs_iget(key.objectid, root); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return d_obtain_alias(&inode->vfs_inode); fail: btrfs_free_path(path); return ERR_PTR(ret); @@ -219,11 +224,11 @@ fail: static int btrfs_get_name(struct dentry *parent, char *name, struct dentry *child) { - struct inode *inode = d_inode(child); - struct inode *dir = d_inode(parent); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_path *path; - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_inode *inode = BTRFS_I(d_inode(child)); + struct btrfs_inode *dir = BTRFS_I(d_inode(parent)); + struct btrfs_root *root = dir->root; + struct btrfs_fs_info *fs_info = root->fs_info; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_inode_ref *iref; struct btrfs_root_ref *rref; struct extent_buffer *leaf; @@ -233,37 +238,34 @@ static int btrfs_get_name(struct dentry *parent, char *name, int ret; u64 ino; - if (!S_ISDIR(dir->i_mode)) + if (!S_ISDIR(dir->vfs_inode.i_mode)) return -EINVAL; - ino = btrfs_ino(BTRFS_I(inode)); + ino = 
btrfs_ino(inode); path = btrfs_alloc_path(); if (!path) return -ENOMEM; if (ino == BTRFS_FIRST_FREE_OBJECTID) { - key.objectid = btrfs_root_id(BTRFS_I(inode)->root); + key.objectid = btrfs_root_id(inode->root); key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = (u64)-1; root = fs_info->tree_root; } else { key.objectid = ino; - key.offset = btrfs_ino(BTRFS_I(dir)); key.type = BTRFS_INODE_REF_KEY; + key.offset = btrfs_ino(dir); } ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { - btrfs_free_path(path); return ret; } else if (ret > 0) { - if (ino == BTRFS_FIRST_FREE_OBJECTID) { + if (ino == BTRFS_FIRST_FREE_OBJECTID) path->slots[0]--; - } else { - btrfs_free_path(path); + else return -ENOENT; - } } leaf = path->nodes[0]; @@ -280,7 +282,6 @@ static int btrfs_get_name(struct dentry *parent, char *name, } read_extent_buffer(leaf, name, name_ptr, name_len); - btrfs_free_path(path); /* * have to add the null termination to make sure that reconnect_path diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 6d08c100b01d..13de6af279e5 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -346,10 +346,10 @@ static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 return tree_search_for_insert(tree, offset, NULL, NULL); } -static void extent_io_tree_panic(const struct extent_io_tree *tree, - const struct extent_state *state, - const char *opname, - int err) +static void __cold extent_io_tree_panic(const struct extent_io_tree *tree, + const struct extent_state *state, + const char *opname, + int err) { btrfs_panic(extent_io_tree_to_fs_info(tree), err, "extent io tree error on %s state start %llu end %llu", diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3014a1a23efd..957230abd827 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -70,20 +70,17 @@ static int block_group_bits(struct btrfs_block_group *cache, u64 bits) int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) { struct btrfs_root *root = btrfs_extent_root(fs_info, start); - int ret; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = start; - key.offset = len; key.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - btrfs_free_path(path); - return ret; + key.offset = len; + return btrfs_search_slot(NULL, root, &key, path, 0, 0); } /* @@ -103,7 +100,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; u64 num_refs; u64 extent_flags; @@ -125,16 +122,16 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, search_again: key.objectid = bytenr; - key.offset = offset; if (metadata) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = offset; extent_root = btrfs_extent_root(fs_info, bytenr); ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) - goto out_free; + return ret; if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) { if (path->slots[0]) { @@ -159,7 +156,7 @@ search_again: "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); - goto out_free; + return ret; } ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); @@ -170,7 
+167,7 @@ search_again: "unexpected zero reference count for extent item (%llu %u %llu)", key.objectid, key.type, key.offset); btrfs_abort_transaction(trans, ret); - goto out_free; + return ret; } extent_flags = btrfs_extent_flags(leaf, ei); owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]); @@ -216,8 +213,7 @@ search_again: *flags = extent_flags; if (owning_root) *owning_root = owner; -out_free: - btrfs_free_path(path); + return ret; } @@ -1487,7 +1483,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_extent_item *item; struct btrfs_key key; @@ -1508,7 +1504,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, node->parent, node->ref_root, owner, offset, refs_to_add, extent_op); if ((ret < 0 && ret != -EAGAIN) || !ret) - goto out; + return ret; /* * Ok we had -EAGAIN which means we didn't have space to insert and @@ -1533,8 +1529,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (ret) btrfs_abort_transaction(trans, ret); -out: - btrfs_free_path(path); + return ret; } @@ -1631,7 +1626,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_extent_item *ei; struct extent_buffer *leaf; u32 item_size; @@ -1662,7 +1657,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, again: ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { - goto out; + return ret; } else if (ret > 0) { if (metadata) { if (path->slots[0] > 0) { @@ -1679,8 +1674,8 @@ again: metadata = 0; key.objectid = head->bytenr; - key.offset = head->num_bytes; key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = head->num_bytes; goto again; } } else { @@ -1688,7 +1683,7 @@ again: btrfs_err(fs_info, "missing extent item for extent %llu num_bytes %llu level %d", head->bytenr, head->num_bytes, head->level); - goto out; + return ret; } } @@ -1701,13 +1696,12 @@ again: "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); - goto out; + return ret; } ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); __run_delayed_extent_op(extent_op, leaf, ei); -out: - btrfs_free_path(path); + return ret; } @@ -2348,8 +2342,8 @@ static noinline int check_committed_ref(struct btrfs_inode *inode, int ret; key.objectid = bytenr; - key.offset = (u64)-1; key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) @@ -2874,7 +2868,15 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) block_group->length, &trimmed); + /* + * Not strictly necessary to lock, as the block_group should be + * read-only from btrfs_delete_unused_bgs(). 
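
Several hunks in this file (and earlier in block-group.c and export.c) only swap the .type and .offset assignment lines. The apparent point is consistency with the key's declared field order, so initializations read in (objectid, type, offset) order:

    /* As declared in the uapi headers (CPU byte order variant). */
    struct btrfs_key {
        __u64 objectid;
        __u8 type;
        __u64 offset;
    } __attribute__((__packed__));
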
+ */ + ASSERT(block_group->ro); + spin_lock(&fs_info->unused_bgs_lock); list_del_init(&block_group->bg_list); + spin_unlock(&fs_info->unused_bgs_lock); + btrfs_unfreeze_block_group(block_group); btrfs_put_block_group(block_group); @@ -5465,7 +5467,7 @@ static int check_ref_exists(struct btrfs_trans_handle *trans, { struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_head *head; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_extent_inline_ref *iref; int ret; bool exists = false; @@ -5482,7 +5484,6 @@ again: * If we get 0 then we found our reference, return 1, else * return the error if it's not -ENOENT; */ - btrfs_free_path(path); return (ret < 0 ) ? ret : 1; } @@ -5517,7 +5518,6 @@ again: mutex_unlock(&head->mutex); out: spin_unlock(&delayed_refs->lock); - btrfs_free_path(path); return exists ? 1 : 0; } @@ -6285,7 +6285,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *parent) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct walk_control *wc; int level; int parent_level; @@ -6298,10 +6298,8 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return -ENOMEM; wc = kzalloc(sizeof(*wc), GFP_NOFS); - if (!wc) { - btrfs_free_path(path); + if (!wc) return -ENOMEM; - } btrfs_assert_tree_write_locked(parent); parent_level = btrfs_header_level(parent); @@ -6338,7 +6336,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, } kfree(wc); - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index cfa52264f678..0ed682d9ed7b 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -4,7 +4,6 @@ #define BTRFS_EXTENT_TREE_H #include <linux/types.h> -#include "misc.h" #include "block-group.h" #include "locking.h" diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b2fae67f8fa3..197f5e51c474 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -425,14 +425,14 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); ASSERT(folio_pos(folio) <= start && - start + len <= folio_pos(folio) + PAGE_SIZE); + start + len <= folio_pos(folio) + folio_size(folio)); if (uptodate && btrfs_verify_folio(folio, start, len)) btrfs_folio_set_uptodate(fs_info, folio, start, len); else btrfs_folio_clear_uptodate(fs_info, folio, start, len); - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_is_subpage(fs_info, folio)) folio_unlock(folio); else btrfs_folio_end_lock(fs_info, folio, start, len); @@ -488,11 +488,11 @@ static void end_bbio_data_write(struct btrfs_bio *bbio) static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) { ASSERT(folio_test_locked(folio)); - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_is_subpage(fs_info, folio)) return; ASSERT(folio_test_private(folio)); - btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE); + btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), folio_size(folio)); } /* @@ -753,7 +753,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, { struct btrfs_inode *inode = folio_to_inode(folio); - ASSERT(pg_offset + size <= PAGE_SIZE); + ASSERT(pg_offset + size <= folio_size(folio)); ASSERT(bio_ctrl->end_io_func); if (bio_ctrl->bbio && @@ -836,7 +836,7 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb, if (folio->mapping) lockdep_assert_held(&folio->mapping->i_private_lock); - if 
(fs_info->nodesize >= PAGE_SIZE) { + if (!btrfs_meta_is_subpage(fs_info)) { if (!folio_test_private(folio)) folio_attach_private(folio, eb); else @@ -870,7 +870,7 @@ int set_folio_extent_mapped(struct folio *folio) fs_info = folio_to_fs_info(folio); - if (btrfs_is_subpage(fs_info, folio->mapping)) + if (btrfs_is_subpage(fs_info, folio)) return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA); folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE); @@ -887,8 +887,8 @@ void clear_folio_extent_mapped(struct folio *folio) return; fs_info = folio_to_fs_info(folio); - if (btrfs_is_subpage(fs_info, folio->mapping)) - return btrfs_detach_subpage(fs_info, folio); + if (btrfs_is_subpage(fs_info, folio)) + return btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA); folio_detach_private(folio); } @@ -935,16 +935,12 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); u64 start = folio_pos(folio); - const u64 end = start + PAGE_SIZE - 1; - u64 cur = start; + const u64 end = start + folio_size(folio) - 1; u64 extent_offset; u64 last_byte = i_size_read(inode); - u64 block_start; struct extent_map *em; int ret = 0; - size_t pg_offset = 0; - size_t iosize; - size_t blocksize = fs_info->sectorsize; + const size_t blocksize = fs_info->sectorsize; ret = set_folio_extent_mapped(folio); if (ret < 0) { @@ -955,25 +951,29 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (folio_contains(folio, last_byte >> PAGE_SHIFT)) { size_t zero_offset = offset_in_folio(folio, last_byte); - if (zero_offset) { - iosize = folio_size(folio) - zero_offset; - folio_zero_range(folio, zero_offset, iosize); - } + if (zero_offset) + folio_zero_range(folio, zero_offset, + folio_size(folio) - zero_offset); } bio_ctrl->end_io_func = end_bbio_data_read; begin_folio_read(fs_info, folio); - while (cur <= end) { + for (u64 cur = start; cur <= end; cur += blocksize) { enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; + unsigned long pg_offset = offset_in_folio(folio, cur); bool force_bio_submit = false; u64 disk_bytenr; + u64 block_start; ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) { - iosize = folio_size(folio) - pg_offset; - folio_zero_range(folio, pg_offset, iosize); - end_folio_read(folio, true, cur, iosize); + folio_zero_range(folio, pg_offset, end - cur + 1); + end_folio_read(folio, true, cur, end - cur + 1); break; } + if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { + end_folio_read(folio, true, cur, blocksize); + continue; + } em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached); if (IS_ERR(em)) { end_folio_read(folio, false, cur, end + 1 - cur); @@ -985,15 +985,15 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, compress_type = extent_map_compression(em); - iosize = min(extent_map_end(em) - cur, end - cur + 1); - iosize = ALIGN(iosize, blocksize); if (compress_type != BTRFS_COMPRESS_NONE) disk_bytenr = em->disk_bytenr; else disk_bytenr = extent_map_block_start(em) + extent_offset; - block_start = extent_map_block_start(em); + if (em->flags & EXTENT_FLAG_PREALLOC) block_start = EXTENT_MAP_HOLE; + else + block_start = extent_map_block_start(em); /* * If we have a file range that points to a compressed extent @@ -1042,18 +1042,13 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, /* we've found a hole, just zero 
and go on */ if (block_start == EXTENT_MAP_HOLE) { - folio_zero_range(folio, pg_offset, iosize); - - end_folio_read(folio, true, cur, iosize); - cur = cur + iosize; - pg_offset += iosize; + folio_zero_range(folio, pg_offset, blocksize); + end_folio_read(folio, true, cur, blocksize); continue; } /* the get_extent function already copied into the folio */ if (block_start == EXTENT_MAP_INLINE) { - end_folio_read(folio, true, cur, iosize); - cur = cur + iosize; - pg_offset += iosize; + end_folio_read(folio, true, cur, blocksize); continue; } @@ -1064,15 +1059,190 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (force_bio_submit) submit_one_bio(bio_ctrl); - submit_extent_folio(bio_ctrl, disk_bytenr, folio, iosize, + submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize, pg_offset); - cur = cur + iosize; - pg_offset += iosize; } - return 0; } +/* + * Check if we can skip waiting for the @ordered extent covering the block at @fileoff. + * + * @fileoff: Both input and output. + * Input: the file offset where the check should start. + * Output: where the next check should start, + * if the function returns true. + * + * Return true if we can skip to @fileoff. The caller needs to check the new + * @fileoff value to make sure it covers the full range, before skipping the + * full OE. + * + * Return false if we must wait for the ordered extent. + */ +static bool can_skip_one_ordered_range(struct btrfs_inode *inode, + struct btrfs_ordered_extent *ordered, + u64 *fileoff) +{ + const struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct folio *folio; + const u32 blocksize = fs_info->sectorsize; + u64 cur = *fileoff; + bool ret; + + folio = filemap_get_folio(inode->vfs_inode.i_mapping, cur >> PAGE_SHIFT); + + /* + * We should have locked the folio(s) for range [start, end], thus + * there must be a folio and it must be locked. + */ + ASSERT(!IS_ERR(folio)); + ASSERT(folio_test_locked(folio)); + + /* + * There are several cases for the folio and OE combination: + * + * 1) Folio has no private flag + * The OE has all its IO done but not yet finished, and the folio got + * invalidated. + * + * Here we have to wait for the OE to finish, as it may contain the + * to-be-inserted data checksum. + * Without the data checksum inserted into the csum tree, read will + * just fail with missing csum. + */ + if (!folio_test_private(folio)) { + ret = false; + goto out; + } + + /* + * 2) The first block is DIRTY. + * + * This means the OE was created by some other folios whose file pos is + * before this one. And since we are holding the folio lock, the writeback + * of this folio cannot start. + * + * We must skip the whole OE, because it will never start until we + * have finished our folio read and unlocked the folio. + */ + if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) { + u64 range_len = min(folio_pos(folio) + folio_size(folio), + ordered->file_offset + ordered->num_bytes) - cur; + + ret = true; + /* + * At least inside the folio, all the remaining blocks should + * also be dirty. + */ + ASSERT(btrfs_folio_test_dirty(fs_info, folio, cur, range_len)); + *fileoff = ordered->file_offset + ordered->num_bytes; + goto out; + } + + /* + * 3) The first block is uptodate. + * + * At least the first block can be skipped, but we are still not fully + * sure. E.g. if the OE has some other folios in the range that cannot + * be skipped. + * So we return true and update @fileoff to the OE/folio boundary.
+ */ + if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { + u64 range_len = min(folio_pos(folio) + folio_size(folio), + ordered->file_offset + ordered->num_bytes) - cur; + + /* + * The whole range to the OE end or folio boundary should also + * be uptodate. + */ + ASSERT(btrfs_folio_test_uptodate(fs_info, folio, cur, range_len)); + ret = true; + *fileoff = cur + range_len; + goto out; + } + + /* + * 4) The first block is not uptodate. + * + * This means the folio was invalidated after the writeback finished, + * but some other operation (e.g. a block-aligned buffered write) + * re-inserted the folio into the filemap. + * Very much the same as case 1). + */ + ret = false; +out: + folio_put(folio); + return ret; +} + +static bool can_skip_ordered_extent(struct btrfs_inode *inode, + struct btrfs_ordered_extent *ordered, + u64 start, u64 end) +{ + const u64 range_end = min(end, ordered->file_offset + ordered->num_bytes - 1); + u64 cur = max(start, ordered->file_offset); + + while (cur < range_end) { + bool can_skip; + + can_skip = can_skip_one_ordered_range(inode, ordered, &cur); + if (!can_skip) + return false; + } + return true; +} + +/* + * Locking helper to make sure we get a stable view of extent maps for the + * involved range. + * + * This is for folio read paths (read and readahead), thus the involved range + * should have all the folios locked. + */ +static void lock_extents_for_read(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state) +{ + u64 cur_pos; + + /* Caller must provide a valid @cached_state. */ + ASSERT(cached_state); + + /* The range must at least be page aligned, as all read paths are folio based. */ + ASSERT(IS_ALIGNED(start, PAGE_SIZE)); + ASSERT(IS_ALIGNED(end + 1, PAGE_SIZE)); + +again: + lock_extent(&inode->io_tree, start, end, cached_state); + cur_pos = start; + while (cur_pos < end) { + struct btrfs_ordered_extent *ordered; + + ordered = btrfs_lookup_ordered_range(inode, cur_pos, + end - cur_pos + 1); + /* + * No ordered extents in the range, and since we hold the extent lock, + * no one can modify the extent maps in the range, so we're safe to return. + */ + if (!ordered) + break; + + /* Check if we can skip waiting for the whole OE. */ + if (can_skip_ordered_extent(inode, ordered, start, end)) { + cur_pos = min(ordered->file_offset + ordered->num_bytes, + end + 1); + btrfs_put_ordered_extent(ordered); + continue; + } + + /* Now wait for the OE to finish. */ + unlock_extent(&inode->io_tree, start, end, cached_state); + btrfs_start_ordered_extent_nowriteback(ordered, start, end + 1 - start); + btrfs_put_ordered_extent(ordered); + /* We have unlocked the whole range, restart from the beginning.
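can_skip_one_ordered_range() and can_skip_ordered_extent() above implement a cursor-advancing scan: each probe either proves a sub-range safe and moves the cursor forward, or forces the caller to drop locks, wait, and restart. A compact userspace model of that control flow, with the per-block folio-state check stubbed out (all names here are hypothetical, not btrfs APIs):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BLOCKSIZE 4096u

/* Stub for the per-block check: advance *pos past the range it
 * proved safe, or return false to force a wait on the whole OE. */
static bool skip_one_range(uint64_t *pos, uint64_t range_end)
{
        (void)range_end;
        *pos += BLOCKSIZE;      /* assumption: one block proven safe */
        return true;
}

/* Mirror of the skip loop: one failure means the caller must wait. */
static bool can_skip(uint64_t start, uint64_t end)
{
        uint64_t cur = start;

        while (cur < end)
                if (!skip_one_range(&cur, end))
                        return false;
        return true;
}

int main(void)
{
        printf("skip [0, 16384)? %d\n", can_skip(0, 16384));
        return 0;
}

The restart-from-scratch on failure is what keeps the real locking simple: after unlocking and waiting, no partial state from the previous pass is trusted.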
*/ + goto again; + } +} + int btrfs_read_folio(struct file *file, struct folio *folio) { struct btrfs_inode *inode = folio_to_inode(folio); @@ -1083,7 +1253,7 @@ int btrfs_read_folio(struct file *file, struct folio *folio) struct extent_map *em_cached = NULL; int ret; - btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); + lock_extents_for_read(inode, start, end, &cached_state); ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); unlock_extent(&inode->io_tree, start, end, &cached_state); @@ -1105,7 +1275,7 @@ static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bit unsigned int start_bit; unsigned int nbits; - ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE); + ASSERT(start >= folio_start && start + len <= folio_start + folio_size(folio)); start_bit = (start - folio_start) >> fs_info->sectorsize_bits; nbits = len >> fs_info->sectorsize_bits; ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits)); @@ -1118,12 +1288,12 @@ static bool find_next_delalloc_bitmap(struct folio *folio, { struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); const u64 folio_start = folio_pos(folio); - const unsigned int bitmap_size = fs_info->sectors_per_page; + const unsigned int bitmap_size = btrfs_blocks_per_folio(fs_info, folio); unsigned int start_bit; unsigned int first_zero; unsigned int first_set; - ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE); + ASSERT(start >= folio_start && start < folio_start + folio_size(folio)); start_bit = (start - folio_start) >> fs_info->sectorsize_bits; first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit); @@ -1157,9 +1327,10 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, { struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode); struct writeback_control *wbc = bio_ctrl->wbc; - const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping); + const bool is_subpage = btrfs_is_subpage(fs_info, folio); const u64 page_start = folio_pos(folio); const u64 page_end = page_start + folio_size(folio) - 1; + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); unsigned long delalloc_bitmap = 0; /* * Save the last found delalloc end. As the delalloc end can go beyond @@ -1184,14 +1355,14 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, int bit; /* Save the dirty bitmap as our submission bitmap will be a subset of it. 
*/ - if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) { - ASSERT(fs_info->sectors_per_page > 1); + if (btrfs_is_subpage(fs_info, folio)) { + ASSERT(blocks_per_folio > 1); btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap); } else { bio_ctrl->submit_bitmap = 1; } - for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) { u64 start = page_start + (bit << fs_info->sectorsize_bits); btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize); @@ -1264,7 +1435,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, btrfs_root_id(inode->root), btrfs_ino(inode), folio_pos(folio), - fs_info->sectors_per_page, + blocks_per_folio, &bio_ctrl->submit_bitmap, found_start, found_len, ret); } else { @@ -1309,7 +1480,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, unsigned int bitmap_size = min( (last_finished_delalloc_end - page_start) >> fs_info->sectorsize_bits, - fs_info->sectors_per_page); + blocks_per_folio); for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size) btrfs_mark_ordered_io_finished(inode, folio, @@ -1324,7 +1495,7 @@ out: delalloc_end = page_end; /* * delalloc_end is already one less than the total length, so - * we don't subtract one from PAGE_SIZE + * we don't subtract one from PAGE_SIZE. */ delalloc_to_write += DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE); @@ -1333,7 +1504,7 @@ out: * If all ranges are submitted asynchronously, we just need to account * for them here. */ - if (bitmap_empty(&bio_ctrl->submit_bitmap, fs_info->sectors_per_page)) { + if (bitmap_empty(&bio_ctrl->submit_bitmap, blocks_per_folio)) { wbc->nr_to_write -= delalloc_to_write; return 1; } @@ -1434,6 +1605,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, bool submitted_io = false; bool error = false; const u64 folio_start = folio_pos(folio); + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); u64 cur; int bit; int ret = 0; @@ -1442,21 +1614,23 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, start + len <= folio_start + folio_size(folio)); ret = btrfs_writepage_cow_fixup(folio); - if (ret) { + if (ret == -EAGAIN) { /* Fixup worker will requeue */ folio_redirty_for_writepage(bio_ctrl->wbc, folio); folio_unlock(folio); return 1; } + if (ret < 0) + return ret; for (cur = start; cur < start + len; cur += fs_info->sectorsize) set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap, - fs_info->sectors_per_page); + blocks_per_folio); bio_ctrl->end_io_func = end_bbio_data_write; - for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) { cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits); if (cur >= i_size) { @@ -1530,6 +1704,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl size_t pg_offset; loff_t i_size = i_size_read(&inode->vfs_inode); unsigned long end_index = i_size >> PAGE_SHIFT; + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc); @@ -1551,6 +1726,30 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl * The proper bitmap cannot be initialized until writepage_delalloc().
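The writeback path above sizes its dirty/submit bitmaps via btrfs_blocks_per_folio() instead of the removed fs_info->sectors_per_page, since a folio may now span more than one page. A userspace sketch of mapping set bits in such a bitmap back to file offsets (the constants are assumptions for illustration only):

#include <stdint.h>
#include <stdio.h>

#define SECTORSIZE_BITS 12              /* assumption: 4K blocks */

static unsigned int blocks_per_folio(size_t folio_size)
{
        return folio_size >> SECTORSIZE_BITS;
}

int main(void)
{
        size_t folio_size = 32768;      /* assumption: one 32K folio */
        unsigned long submit_bitmap = 0x0d;     /* blocks 0, 2, 3 */
        uint64_t folio_start = 1 << 20;

        for (unsigned int bit = 0; bit < blocks_per_folio(folio_size); bit++) {
                if (!(submit_bitmap & (1UL << bit)))
                        continue;
                /* Each set bit maps back to a file offset. */
                printf("submit block at %llu\n",
                       (unsigned long long)(folio_start +
                                ((uint64_t)bit << SECTORSIZE_BITS)));
        }
        return 0;
}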
*/ bio_ctrl->submit_bitmap = (unsigned long)-1; + + /* + * If the page is dirty but without private set, it's marked dirty + * without informing the fs. + * Nowadays that is a bug, since the introduction of + * pin_user_pages*(). + * + * So here we check if the page has private set to rule out such + * case. + * But we also have a long history of relying on the COW fixup, + * so here we only enable this check for experimental builds until + * we're sure it's safe. + */ + if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) && + unlikely(!folio_test_private(folio))) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + btrfs_err_rl(fs_info, + "root %lld ino %llu folio %llu is marked dirty without notifying the fs", + inode->root->root_key.objectid, + btrfs_ino(inode), folio_pos(folio)); + ret = -EUCLEAN; + goto done; + } + ret = set_folio_extent_mapped(folio); if (ret < 0) goto done; @@ -1562,14 +1761,14 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl goto done; ret = extent_writepage_io(inode, folio, folio_pos(folio), - PAGE_SIZE, bio_ctrl, i_size); + folio_size(folio), bio_ctrl, i_size); if (ret == 1) return 0; if (ret < 0) btrfs_err_rl(fs_info, "failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d", btrfs_root_id(inode->root), btrfs_ino(inode), - folio_pos(folio), fs_info->sectors_per_page, + folio_pos(folio), blocks_per_folio, &bio_ctrl->submit_bitmap, ret); bio_ctrl->wbc->nr_to_write--; @@ -1725,20 +1924,13 @@ static struct extent_buffer *find_extent_buffer_nolock( static void end_bbio_meta_write(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; - struct btrfs_fs_info *fs_info = eb->fs_info; struct folio_iter fi; - u32 bio_offset = 0; if (bbio->bio.bi_status != BLK_STS_OK) set_btree_ioerr(eb); bio_for_each_folio_all(fi, &bbio->bio) { - u64 start = eb->start + bio_offset; - struct folio *folio = fi.folio; - u32 len = fi.length; - - btrfs_folio_clear_writeback(fs_info, folio, start, len); - bio_offset += len; + btrfs_meta_folio_clear_writeback(fi.folio, eb); } clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); @@ -1792,38 +1984,21 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, wbc_init_bio(wbc, &bbio->bio); bbio->inode = BTRFS_I(eb->fs_info->btree_inode); bbio->file_offset = eb->start; - if (fs_info->nodesize < PAGE_SIZE) { - struct folio *folio = eb->folios[0]; - bool ret; + for (int i = 0; i < num_extent_folios(eb); i++) { + struct folio *folio = eb->folios[i]; + u64 range_start = max_t(u64, eb->start, folio_pos(folio)); + u32 range_len = min_t(u64, folio_pos(folio) + folio_size(folio), + eb->start + eb->len) - range_start; folio_lock(folio); - btrfs_subpage_set_writeback(fs_info, folio, eb->start, eb->len); - if (btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, - eb->len)) { - folio_clear_dirty_for_io(folio); - wbc->nr_to_write--; - } - ret = bio_add_folio(&bbio->bio, folio, eb->len, - eb->start - folio_pos(folio)); - ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio, eb->len); - folio_unlock(folio); - } else { - int num_folios = num_extent_folios(eb); - - for (int i = 0; i < num_folios; i++) { - struct folio *folio = eb->folios[i]; - bool ret; - - folio_lock(folio); - folio_clear_dirty_for_io(folio); - folio_start_writeback(folio); - ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); - ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio, eb->folio_size); + btrfs_meta_folio_clear_dirty(folio, eb); + btrfs_meta_folio_set_writeback(folio, eb); + if (!folio_test_dirty(folio)) 
wbc->nr_to_write -= folio_nr_pages(folio); - folio_unlock(folio); - } + bio_add_folio_nofail(&bbio->bio, folio, range_len, + offset_in_folio(folio, range_start)); + wbc_account_cgroup_owner(wbc, folio, range_len); + folio_unlock(folio); } btrfs_submit_bbio(bbio, 0); } @@ -1849,9 +2024,10 @@ static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc) u64 folio_start = folio_pos(folio); int bit_start = 0; int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); /* Lock and write each dirty extent buffer in the range */ - while (bit_start < fs_info->sectors_per_page) { + while (bit_start < blocks_per_folio) { struct btrfs_subpage *subpage = folio_get_private(folio); struct extent_buffer *eb; unsigned long flags; @@ -1867,7 +2043,7 @@ static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc) break; } spin_lock_irqsave(&subpage->lock, flags); - if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * fs_info->sectors_per_page, + if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * blocks_per_folio, subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); spin_unlock(&folio->mapping->i_private_lock); @@ -1933,7 +2109,7 @@ static int submit_eb_page(struct folio *folio, struct btrfs_eb_write_context *ct if (!folio_test_private(folio)) return 0; - if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE) + if (btrfs_meta_is_subpage(folio_to_fs_info(folio))) return submit_eb_subpage(folio, wbc); spin_lock(&mapping->i_private_lock); @@ -2192,10 +2368,8 @@ retry: done_index = folio_next_index(folio); /* * At this point we hold neither the i_pages lock nor - * the page lock: the page may be truncated or - * invalidated (changing page->mapping to NULL), - * or even swizzled back from swapper_space to - * tmpfs file mapping + * the folio lock: the folio may be truncated or + * invalidated (changing folio->mapping to NULL). */ if (!folio_trylock(folio)) { submit_write_bio(bio_ctrl, 0); @@ -2233,7 +2407,7 @@ retry: * regular submission. */ if (wbc->sync_mode != WB_SYNC_NONE || - btrfs_is_subpage(inode_to_fs_info(inode), mapping)) { + btrfs_is_subpage(inode_to_fs_info(inode), folio)) { if (folio_test_writeback(folio)) submit_write_bio(bio_ctrl, 0); folio_wait_writeback(folio); @@ -2314,8 +2488,8 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); while (cur <= end) { - u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); - u32 cur_len = cur_end + 1 - cur; + u64 cur_end; + u32 cur_len; struct folio *folio; folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT); @@ -2325,13 +2499,18 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f * code is just in case, but shouldn't actually be run.
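write_one_eb() above (and the read submission later in this file) replaces the subpage special case with one loop that clamps the extent buffer against each folio using a max/min pair. The intersection arithmetic in isolation, as a sketch with made-up sizes:

#include <stdint.h>
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
        /* assumption: a 16K eb straddling 8K folios */
        uint64_t eb_start = 12288, eb_len = 16384;
        uint64_t folio_pos = 8192, folio_size = 8192;

        /* Clamp [eb_start, eb_start + eb_len) to this folio. */
        uint64_t range_start = MAX(eb_start, folio_pos);
        uint64_t range_len = MIN(folio_pos + folio_size, eb_start + eb_len)
                             - range_start;

        printf("add %llu bytes at offset %llu within the folio\n",
               (unsigned long long)range_len,
               (unsigned long long)(range_start - folio_pos));
        return 0;
}

Because the clamp works for any folio size, the same loop body serves nodesize < PAGE_SIZE, nodesize == PAGE_SIZE, and large folios alike.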
*/ if (IS_ERR(folio)) { + cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); + cur_len = cur_end + 1 - cur; btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, cur, cur_len, false); mapping_set_error(mapping, PTR_ERR(folio)); - cur = cur_end + 1; + cur = cur_end; continue; } + cur_end = min_t(u64, folio_pos(folio) + folio_size(folio) - 1, end); + cur_len = cur_end + 1 - cur; + ASSERT(folio_test_locked(folio)); if (pages_dirty && folio != locked_folio) ASSERT(folio_test_dirty(folio)); @@ -2390,7 +2569,7 @@ void btrfs_readahead(struct readahead_control *rac) struct extent_map *em_cached = NULL; u64 prev_em_start = (u64)-1; - btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); + lock_extents_for_read(inode, start, end, &cached_state); while ((folio = readahead_folio(rac)) != NULL) btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); @@ -2443,7 +2622,7 @@ static bool try_release_extent_state(struct extent_io_tree *tree, struct folio *folio) { u64 start = folio_pos(folio); - u64 end = start + PAGE_SIZE - 1; + u64 end = start + folio_size(folio) - 1; bool ret; if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) { @@ -2481,7 +2660,7 @@ static bool try_release_extent_state(struct extent_io_tree *tree, bool try_release_extent_mapping(struct folio *folio, gfp_t mask) { u64 start = folio_pos(folio); - u64 end = start + PAGE_SIZE - 1; + u64 end = start + folio_size(folio) - 1; struct btrfs_inode *inode = folio_to_inode(folio); struct extent_io_tree *io_tree = &inode->io_tree; @@ -2592,7 +2771,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo return; } - if (fs_info->nodesize >= PAGE_SIZE) { + if (!btrfs_meta_is_subpage(fs_info)) { /* * We do this since we'll remove the pages after we've * removed the eb from the radix tree, so we could race @@ -2618,7 +2797,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * attached to one dummy eb, no sharing. */ if (!mapped) { - btrfs_detach_subpage(fs_info, folio); + btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA); return; } @@ -2629,7 +2808,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * page range and no unfinished IO. 
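extent_write_locked_range() now computes each iteration's span from the folio it actually found rather than assuming PAGE_SIZE strides, so mixed folio sizes are handled naturally. A userspace sketch of that walk, with the page-cache lookup stubbed out (names and sizes are illustrative assumptions):

#include <stdint.h>
#include <stdio.h>

/* Stub: size of the folio covering @pos; real code asks the page cache. */
static uint64_t folio_bytes_at(uint64_t pos)
{
        return (pos & 8192) ? 4096 : 8192;      /* assumption: mixed sizes */
}

int main(void)
{
        uint64_t start = 0, end = 16383;        /* inclusive end */

        for (uint64_t cur = start; cur <= end; ) {
                uint64_t fsize = folio_bytes_at(cur);
                uint64_t fpos = cur - (cur % fsize);    /* folios are naturally aligned */
                uint64_t cur_end = fpos + fsize - 1;

                if (cur_end > end)
                        cur_end = end;
                printf("process [%llu, %llu]\n",
                       (unsigned long long)cur, (unsigned long long)cur_end);
                cur = cur_end + 1;
        }
        return 0;
}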
*/ if (!folio_range_has_eb(folio)) - btrfs_detach_subpage(fs_info, folio); + btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA); spin_unlock(&folio->mapping->i_private_lock); } @@ -2662,15 +2841,14 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) kmem_cache_free(extent_buffer_cache, eb); } -static struct extent_buffer * -__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, - unsigned long len) +static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) { struct extent_buffer *eb = NULL; eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL); eb->start = start; - eb->len = len; + eb->len = fs_info->nodesize; eb->fs_info = fs_info; init_rwsem(&eb->lock); @@ -2679,7 +2857,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); - ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE); + ASSERT(eb->len <= BTRFS_MAX_METADATA_BLOCKSIZE); return eb; } @@ -2687,10 +2865,9 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) { struct extent_buffer *new; - int num_folios = num_extent_folios(src); int ret; - new = __alloc_extent_buffer(src->fs_info, src->start, src->len); + new = __alloc_extent_buffer(src->fs_info, src->start); if (new == NULL) return NULL; @@ -2707,7 +2884,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) return NULL; } - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(src); i++) { struct folio *folio = new->folios[i]; ret = attach_extent_buffer_folio(new, folio, NULL); @@ -2723,26 +2900,24 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) return new; } -struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len) +struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) { struct extent_buffer *eb; - int num_folios = 0; int ret; - eb = __alloc_extent_buffer(fs_info, start, len); + eb = __alloc_extent_buffer(fs_info, start); if (!eb) return NULL; ret = alloc_eb_folio_array(eb, false); if (ret) - goto err; + goto out; - num_folios = num_extent_folios(eb); - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL); if (ret < 0) - goto err; + goto out_detach; } set_extent_buffer_uptodate(eb); @@ -2750,23 +2925,19 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); return eb; -err: - for (int i = 0; i < num_folios; i++) { + +out_detach: + for (int i = 0; i < num_extent_folios(eb); i++) { if (eb->folios[i]) { detach_extent_buffer_folio(eb, eb->folios[i]); folio_put(eb->folios[i]); } } +out: kmem_cache_free(extent_buffer_cache, eb); return NULL; } -struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start) -{ - return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize); -} - static void check_buffer_tree_ref(struct extent_buffer *eb) { int refs; @@ -2805,11 +2976,9 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) static void mark_extent_buffer_accessed(struct extent_buffer *eb) { - int num_folios= num_extent_folios(eb); - check_buffer_tree_ref(eb); - for (int i = 0; i < num_folios; i++) + for (int i = 0; i < num_extent_folios(eb); i++) 
folio_mark_accessed(eb->folios[i]); } @@ -2842,10 +3011,10 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, return eb; } -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, u64 start) { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *eb, *exists = NULL; int ret; @@ -2881,8 +3050,11 @@ again: free_eb: btrfs_release_extent_buffer(eb); return exists; -} +#else + /* Stub to avoid linker error when compiled with optimizations turned off. */ + return NULL; #endif +} static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, struct folio *folio) @@ -2896,7 +3068,7 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, * don't try to insert two ebs for the same bytenr. So here we always * return NULL and just continue. */ - if (fs_info->nodesize < PAGE_SIZE) + if (btrfs_meta_is_subpage(fs_info)) return NULL; /* Page not yet attached to an extent buffer */ @@ -2999,7 +3171,7 @@ retry: finish: spin_lock(&mapping->i_private_lock); - if (existing_folio && fs_info->nodesize < PAGE_SIZE) { + if (existing_folio && btrfs_meta_is_subpage(fs_info)) { /* We're going to reuse the existing page, can drop our folio now. */ __free_page(folio_page(eb->folios[i], 0)); eb->folios[i] = existing_folio; @@ -3041,8 +3213,6 @@ finish: struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level) { - unsigned long len = fs_info->nodesize; - int num_folios; int attached = 0; struct extent_buffer *eb; struct extent_buffer *existing_eb = NULL; @@ -3070,7 +3240,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, if (eb) return eb; - eb = __alloc_extent_buffer(fs_info, start, len); + eb = __alloc_extent_buffer(fs_info, start); if (!eb) return ERR_PTR(-ENOMEM); @@ -3090,8 +3260,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * The memory will be freed by attach_extent_buffer_page() or freed * manually if we exit earlier. */ - if (fs_info->nodesize < PAGE_SIZE) { - prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); + if (btrfs_meta_is_subpage(fs_info)) { + prealloc = btrfs_alloc_subpage(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA); if (IS_ERR(prealloc)) { ret = PTR_ERR(prealloc); goto out; @@ -3106,9 +3276,8 @@ reallocate: goto out; } - num_folios = num_extent_folios(eb); /* Attach all pages to the filemap. */ - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio; ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb); @@ -3148,7 +3317,7 @@ reallocate: * and free the allocated page. */ folio = eb->folios[i]; - WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len)); + WARN_ON(btrfs_meta_folio_test_dirty(folio, eb)); /* * Check if the current page is physically contiguous with previous eb @@ -3159,7 +3328,7 @@ reallocate: if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0)) page_contig = false; - if (!btrfs_folio_test_uptodate(fs_info, folio, eb->start, eb->len)) + if (!btrfs_meta_folio_test_uptodate(folio, eb)) uptodate = 0; /* @@ -3202,7 +3371,7 @@ again: * btree_release_folio will correctly detect that a page belongs to a * live buffer and won't free them prematurely. 
*/ - for (int i = 0; i < num_folios; i++) + for (int i = 0; i < num_extent_folios(eb); i++) folio_unlock(eb->folios[i]); return eb; @@ -3233,7 +3402,7 @@ out: } /* * Now all pages of that extent buffer are unmapped, set UNMAPPED flag, - * so it can be cleaned up without utilizing page->mapping. + * so it can be cleaned up without utilizing folio->mapping. */ set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); @@ -3333,11 +3502,10 @@ void free_extent_buffer_stale(struct extent_buffer *eb) release_extent_buffer(eb); } -static void btree_clear_folio_dirty(struct folio *folio) +static void btree_clear_folio_dirty_tag(struct folio *folio) { - ASSERT(folio_test_dirty(folio)); + ASSERT(!folio_test_dirty(folio)); ASSERT(folio_test_locked(folio)); - folio_clear_dirty_for_io(folio); xa_lock_irq(&folio->mapping->i_pages); if (!folio_test_dirty(folio)) __xa_clear_mark(&folio->mapping->i_pages, @@ -3345,26 +3513,10 @@ static void btree_clear_folio_dirty(struct folio *folio) xa_unlock_irq(&folio->mapping->i_pages); } -static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) -{ - struct btrfs_fs_info *fs_info = eb->fs_info; - struct folio *folio = eb->folios[0]; - bool last; - - /* btree_clear_folio_dirty() needs page locked. */ - folio_lock(folio); - last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, eb->len); - if (last) - btree_clear_folio_dirty(folio); - folio_unlock(folio); - WARN_ON(atomic_read(&eb->refs) == 0); -} - void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios; btrfs_assert_tree_write_locked(eb); @@ -3391,17 +3543,16 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, fs_info->dirty_metadata_batch); - if (eb->fs_info->nodesize < PAGE_SIZE) - return clear_subpage_extent_buffer_dirty(eb); - - num_folios = num_extent_folios(eb); - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; + bool last; if (!folio_test_dirty(folio)) continue; folio_lock(folio); - btree_clear_folio_dirty(folio); + last = btrfs_meta_folio_clear_and_test_dirty(folio, eb); + if (last) + btree_clear_folio_dirty_tag(folio); folio_unlock(folio); } WARN_ON(atomic_read(&eb->refs) == 0); @@ -3409,37 +3560,34 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, void set_extent_buffer_dirty(struct extent_buffer *eb) { - int num_folios; bool was_dirty; check_buffer_tree_ref(eb); was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - num_folios = num_extent_folios(eb); WARN_ON(atomic_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)); if (!was_dirty) { - bool subpage = eb->fs_info->nodesize < PAGE_SIZE; + bool subpage = btrfs_meta_is_subpage(eb->fs_info); /* * For the subpage case, we can have other extent buffers in the - * same page, and in clear_subpage_extent_buffer_dirty() we + * same page, and in clear_extent_buffer_dirty() we * have to clear page dirty without subpage lock held. * This can cause a race where our page gets its dirty flag cleared after * we just set it. * - * Thankfully, clear_subpage_extent_buffer_dirty() has locked + * Thankfully, clear_extent_buffer_dirty() has locked * its page for other reasons, so we can use the page lock to prevent * the above race.
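The comment above explains why the subpage path sets per-block dirty bits under the folio lock: a concurrent clear of the folio-level state could otherwise interleave with the per-block set. A generic model of that lost-update hazard and the lock that prevents it (plain pthreads, nothing btrfs-specific; all names hypothetical):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long block_dirty_bits;
static bool folio_dirty;

/* Both levels of dirty state must change together, under the lock. */
static void mark_block_dirty(unsigned int bit)
{
        pthread_mutex_lock(&lock);
        block_dirty_bits |= 1UL << bit;
        folio_dirty = true;
        pthread_mutex_unlock(&lock);
}

/* The cleaner may drop folio-level state only if no block is dirty. */
static void clear_folio_if_clean(void)
{
        pthread_mutex_lock(&lock);
        if (!block_dirty_bits)
                folio_dirty = false;
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        mark_block_dirty(3);
        clear_folio_if_clean();
        printf("folio_dirty=%d\n", folio_dirty);        /* stays 1 */
        return 0;
}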
*/ if (subpage) folio_lock(eb->folios[0]); - for (int i = 0; i < num_folios; i++) - btrfs_folio_set_dirty(eb->fs_info, eb->folios[i], - eb->start, eb->len); + for (int i = 0; i < num_extent_folios(eb); i++) + btrfs_meta_folio_set_dirty(eb->folios[i], eb); if (subpage) folio_unlock(eb->folios[0]); percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, @@ -3447,54 +3595,31 @@ void set_extent_buffer_dirty(struct extent_buffer *eb) eb->fs_info->dirty_metadata_batch); } #ifdef CONFIG_BTRFS_DEBUG - for (int i = 0; i < num_folios; i++) + for (int i = 0; i < num_extent_folios(eb); i++) ASSERT(folio_test_dirty(eb->folios[i])); #endif } void clear_extent_buffer_uptodate(struct extent_buffer *eb) { - struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios = num_extent_folios(eb); clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; if (!folio) continue; - /* - * This is special handling for metadata subpage, as regular - * btrfs_is_subpage() can not handle cloned/dummy metadata. - */ - if (fs_info->nodesize >= PAGE_SIZE) - folio_clear_uptodate(folio); - else - btrfs_subpage_clear_uptodate(fs_info, folio, - eb->start, eb->len); + btrfs_meta_folio_clear_uptodate(folio, eb); } } void set_extent_buffer_uptodate(struct extent_buffer *eb) { - struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios = num_extent_folios(eb); set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - for (int i = 0; i < num_folios; i++) { - struct folio *folio = eb->folios[i]; - - /* - * This is special handling for metadata subpage, as regular - * btrfs_is_subpage() can not handle cloned/dummy metadata. - */ - if (fs_info->nodesize >= PAGE_SIZE) - folio_mark_uptodate(folio); - else - btrfs_subpage_set_uptodate(fs_info, folio, - eb->start, eb->len); - } + for (int i = 0; i < num_extent_folios(eb); i++) + btrfs_meta_folio_set_uptodate(eb->folios[i], eb); } static void clear_extent_buffer_reading(struct extent_buffer *eb) @@ -3507,10 +3632,7 @@ static void clear_extent_buffer_reading(struct extent_buffer *eb) static void end_bbio_meta_read(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; - struct btrfs_fs_info *fs_info = eb->fs_info; bool uptodate = !bbio->bio.bi_status; - struct folio_iter fi; - u32 bio_offset = 0; /* * If the extent buffer is marked UPTODATE before the read operation @@ -3532,19 +3654,6 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio) set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); } - bio_for_each_folio_all(fi, &bbio->bio) { - struct folio *folio = fi.folio; - u64 start = eb->start + bio_offset; - u32 len = fi.length; - - if (uptodate) - btrfs_folio_set_uptodate(fs_info, folio, start, len); - else - btrfs_folio_clear_uptodate(fs_info, folio, start, len); - - bio_offset += len; - } - clear_extent_buffer_reading(eb); free_extent_buffer(eb); @@ -3555,7 +3664,6 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, const struct btrfs_tree_parent_check *check) { struct btrfs_bio *bbio; - bool ret; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; @@ -3595,19 +3703,14 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, bbio->inode = BTRFS_I(eb->fs_info->btree_inode); bbio->file_offset = eb->start; memcpy(&bbio->parent_check, check, sizeof(*check)); - if (eb->fs_info->nodesize < PAGE_SIZE) { - ret = bio_add_folio(&bbio->bio, eb->folios[0], eb->len, - eb->start - folio_pos(eb->folios[0])); - 
ASSERT(ret); - } else { - int num_folios = num_extent_folios(eb); - - for (int i = 0; i < num_folios; i++) { - struct folio *folio = eb->folios[i]; + for (int i = 0; i < num_extent_folios(eb); i++) { + struct folio *folio = eb->folios[i]; + u64 range_start = max_t(u64, eb->start, folio_pos(folio)); + u32 range_len = min_t(u64, folio_pos(folio) + folio_size(folio), + eb->start + eb->len) - range_start; - ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); - ASSERT(ret); - } + bio_add_folio_nofail(&bbio->bio, folio, range_len, + offset_in_folio(folio, range_start)); } btrfs_submit_bbio(bbio, mirror_num); return 0; @@ -3796,7 +3899,7 @@ static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i) if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) return; - if (fs_info->nodesize < PAGE_SIZE) { + if (btrfs_meta_is_subpage(fs_info)) { folio = eb->folios[0]; ASSERT(i == 0); if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio, @@ -4282,7 +4385,7 @@ int try_release_extent_buffer(struct folio *folio) { struct extent_buffer *eb; - if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE) + if (btrfs_meta_is_subpage(folio_to_fs_info(folio))) return try_release_subpage_extent_buffer(folio); /* diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6c5328bfabc2..2e261892c7bc 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -252,8 +252,6 @@ void clear_folio_extent_mapped(struct folio *folio); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level); -struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len); struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src); @@ -276,7 +274,8 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, u64 gen, int level); void btrfs_readahead_node_child(struct extent_buffer *node, int slot); -static inline int num_extent_pages(const struct extent_buffer *eb) +/* Note: this can be used in for loops without caching the value in a variable. */ +static inline int __pure num_extent_pages(const struct extent_buffer *eb) { /* * For sectorsize == PAGE_SIZE case, since nodesize is always aligned to @@ -294,8 +293,10 @@ static inline int num_extent_pages(const struct extent_buffer *eb) * As we can have either one large folio covering the whole eb * (either nodesize <= PAGE_SIZE, or high order folio), or multiple * single-paged folios. + * + * Note: this can be used in for loops without caching the value in a variable. 
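The new comments above note that num_extent_pages()/num_extent_folios() are safe to call directly in a loop condition; the __pure annotation is what licenses the compiler to hoist the call instead of re-evaluating it each iteration. A small demonstration of the underlying attribute (generic GCC/Clang C, not the kernel macro):

#include <stdio.h>

/* pure: the result depends only on the arguments and on memory the
 * function does not modify, and there are no side effects. */
static int __attribute__((pure)) item_count(const int *arr)
{
        return arr[0];
}

int main(void)
{
        int arr[] = { 3, 10, 20, 30 };
        int total = 0;

        /* The compiler may evaluate item_count(arr) once for the loop. */
        for (int i = 1; i <= item_count(arr); i++)
                total += arr[i];
        printf("%d\n", total);          /* 60 */
        return 0;
}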
*/ -static inline int num_extent_folios(const struct extent_buffer *eb) +static inline int __pure num_extent_folios(const struct extent_buffer *eb) { if (folio_order(eb->folios[0])) return 1; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index d04a3b47b1fb..344b4db487a0 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -163,20 +163,21 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, int ret = 0; struct btrfs_file_extent_item *item; struct btrfs_key file_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; path = btrfs_alloc_path(); if (!path) return -ENOMEM; + file_key.objectid = objectid; - file_key.offset = pos; file_key.type = BTRFS_EXTENT_DATA_KEY; + file_key.offset = pos; ret = btrfs_insert_empty_item(trans, root, path, &file_key, sizeof(*item)); if (ret < 0) - goto out; + return ret; leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -190,8 +191,7 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_compression(leaf, item, 0); btrfs_set_file_extent_encryption(leaf, item, 0); btrfs_set_file_extent_other_encoding(leaf, item, 0); -out: - btrfs_free_path(path); + return ret; } @@ -212,8 +212,8 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans, int csums_in_item; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - file_key.offset = bytenr; file_key.type = BTRFS_EXTENT_CSUM_KEY; + file_key.offset = bytenr; ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow); if (ret < 0) goto fail; @@ -259,8 +259,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, int cow = mod != 0; file_key.objectid = objectid; - file_key.offset = offset; file_key.type = BTRFS_EXTENT_DATA_KEY; + file_key.offset = offset; return btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); } @@ -341,7 +341,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio = &bbio->bio; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); const u32 sectorsize = fs_info->sectorsize; const u32 csum_size = fs_info->csum_size; u32 orig_len = bio->bi_iter.bi_size; @@ -373,10 +373,8 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); - if (!bbio->csum) { - btrfs_free_path(path); + if (!bbio->csum) return BLK_STS_RESOURCE; - } } else { bbio->csum = bbio->csum_inline; } @@ -444,7 +442,6 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) bio_offset += count * sectorsize; } - btrfs_free_path(path); return ret; } @@ -484,8 +481,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, path->nowait = nowait; key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key.offset = start; key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = start; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -874,7 +871,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; u64 end_byte = bytenr + len; u64 csum_end; @@ -892,8 +889,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, while (1) { key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key.offset = end_byte - 1; key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = 
end_byte - 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { @@ -1010,7 +1007,6 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, } btrfs_release_path(path); } - btrfs_free_path(path); return ret; } @@ -1074,8 +1070,8 @@ again: found_next = 0; bytenr = sums->logical + total_bytes; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - file_key.offset = bytenr; file_key.type = BTRFS_EXTENT_CSUM_KEY; + file_key.offset = bytenr; item = btrfs_lookup_csum(trans, root, path, bytenr, 1); if (!IS_ERR(item)) { diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 0e13661a71f3..6181a70ec3ef 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -3,8 +3,10 @@ #ifndef BTRFS_FILE_ITEM_H #define BTRFS_FILE_ITEM_H +#include <linux/blk_types.h> #include <linux/list.h> #include <uapi/linux/btrfs_tree.h> +#include "ctree.h" #include "accessors.h" struct extent_map; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0b568c8d24cb..262a707d8990 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -804,14 +804,15 @@ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 { u64 clamp_start = max_t(u64, pos, folio_pos(folio)); u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio)); + const u32 blocksize = inode_to_fs_info(inode)->sectorsize; int ret = 0; if (folio_test_uptodate(folio)) return 0; if (!force_uptodate && - IS_ALIGNED(clamp_start, PAGE_SIZE) && - IS_ALIGNED(clamp_end, PAGE_SIZE)) + IS_ALIGNED(clamp_start, blocksize) && + IS_ALIGNED(clamp_end, blocksize)) return 0; ret = btrfs_read_folio(NULL, folio); @@ -874,7 +875,6 @@ again: ret = PTR_ERR(folio); return ret; } - folio_wait_writeback(folio); /* Only support page sized folio yet. */ ASSERT(folio_order(folio) == 0); ret = set_folio_extent_mapped(folio); @@ -1014,8 +1014,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, &cached_state); } - ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, - NULL, nowait); + ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, nowait); if (ret <= 0) btrfs_drew_write_unlock(&root->snapshot_lock); else @@ -1783,6 +1782,7 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) struct extent_changeset *data_reserved = NULL; unsigned long zero_start; loff_t size; + size_t fsize = folio_size(folio); vm_fault_t ret; int ret2; int reserved = 0; @@ -1793,7 +1793,7 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) ASSERT(folio_order(folio) == 0); - reserved_space = PAGE_SIZE; + reserved_space = fsize; sb_start_pagefault(inode->i_sb); page_start = folio_pos(folio); @@ -1847,7 +1847,7 @@ again: * We can't set the delalloc bits if there are pending ordered * extents. Drop our locks and wait for them to finish. 
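prepare_uptodate_folio() above now keys the skip-the-read optimization off the filesystem block size instead of PAGE_SIZE: a write covering whole blocks never needs the old contents, regardless of how big the folio is. The decision in isolation (a sketch; the clamping of pos/len to the folio is omitted here):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define IS_ALIGNED(x, a) (((x) & ((a) - 1)) == 0)

static bool need_read_before_write(uint64_t pos, uint64_t len,
                                   uint32_t blocksize, bool folio_uptodate)
{
        if (folio_uptodate)
                return false;
        /* Fully block-aligned writes overwrite whole blocks: no read needed. */
        return !(IS_ALIGNED(pos, blocksize) && IS_ALIGNED(pos + len, blocksize));
}

int main(void)
{
        printf("%d\n", need_read_before_write(4096, 4096, 4096, false)); /* 0 */
        printf("%d\n", need_read_before_write(4096, 100, 4096, false));  /* 1 */
        return 0;
}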
*/ - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE); + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, fsize); if (ordered) { unlock_extent(io_tree, page_start, page_end, &cached_state); folio_unlock(folio); @@ -1859,11 +1859,11 @@ again: if (folio->index == ((size - 1) >> PAGE_SHIFT)) { reserved_space = round_up(size - page_start, fs_info->sectorsize); - if (reserved_space < PAGE_SIZE) { + if (reserved_space < fsize) { end = page_start + reserved_space - 1; btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, - PAGE_SIZE - reserved_space, true); + fsize - reserved_space, true); } } @@ -1890,12 +1890,12 @@ again: if (page_start + folio_size(folio) > size) zero_start = offset_in_folio(folio, size); else - zero_start = PAGE_SIZE; + zero_start = fsize; - if (zero_start != PAGE_SIZE) + if (zero_start != fsize) folio_zero_range(folio, zero_start, folio_size(folio) - zero_start); - btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); + btrfs_folio_clear_checked(fs_info, folio, page_start, fsize); btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start); @@ -1904,7 +1904,7 @@ again: unlock_extent(io_tree, page_start, page_end, &cached_state); up_read(&BTRFS_I(inode)->i_mmap_lock); - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_extents(BTRFS_I(inode), fsize); sb_end_pagefault(inode->i_sb); extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; @@ -1913,7 +1913,7 @@ out_unlock: folio_unlock(folio); up_read(&BTRFS_I(inode)->i_mmap_lock); out: - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_extents(BTRFS_I(inode), fsize); btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, reserved_space, (ret != 0)); out_noreserve: diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h index de89e644be29..d7df81388cbe 100644 --- a/fs/btrfs/file.h +++ b/fs/btrfs/file.h @@ -9,6 +9,8 @@ struct file; struct extent_state; struct kiocb; struct iov_iter; +struct inode; +struct folio; struct page; struct btrfs_ioctl_encoded_io_args; struct btrfs_drop_extents_args; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d42b6f882f57..05e173311c1a 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -88,13 +88,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, struct btrfs_disk_key disk_key; struct btrfs_free_space_header *header; struct extent_buffer *leaf; - struct inode *inode = NULL; + struct btrfs_inode *inode; unsigned nofs_flag; int ret; key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -120,13 +120,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, btrfs_release_path(path); memalloc_nofs_restore(nofs_flag); if (IS_ERR(inode)) - return inode; + return ERR_CAST(inode); - mapping_set_gfp_mask(inode->i_mapping, - mapping_gfp_constraint(inode->i_mapping, + mapping_set_gfp_mask(inode->vfs_inode.i_mapping, + mapping_gfp_constraint(inode->vfs_inode.i_mapping, ~(__GFP_FS | __GFP_HIGHMEM))); - return inode; + return &inode->vfs_inode; } struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, @@ -201,8 +201,8 @@ static int __create_free_space_inode(struct btrfs_root *root, btrfs_release_path(path); key.objectid = 
BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(struct btrfs_free_space_header)); if (ret < 0) { @@ -244,7 +244,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_block_group *block_group) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int ret = 0; @@ -257,12 +257,12 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) { if (PTR_ERR(inode) != -ENOENT) ret = PTR_ERR(inode); - goto out; + return ret; } ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) { btrfs_add_delayed_iput(BTRFS_I(inode)); - goto out; + return ret; } clear_nlink(inode); /* One for the block groups ref */ @@ -285,12 +285,9 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, if (ret) { if (ret > 0) ret = 0; - goto out; + return ret; } - ret = btrfs_del_item(trans, trans->fs_info->tree_root, path); -out: - btrfs_free_path(path); - return ret; + return btrfs_del_item(trans, trans->fs_info->tree_root, path); } int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, @@ -447,7 +444,7 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) { - struct page *page; + struct folio *folio; struct inode *inode = io_ctl->inode; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int i; @@ -455,31 +452,33 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) for (i = 0; i < io_ctl->num_pages; i++) { int ret; - page = find_or_create_page(inode->i_mapping, i, mask); - if (!page) { + folio = __filemap_get_folio(inode->i_mapping, i, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mask); + if (IS_ERR(folio)) { io_ctl_drop_pages(io_ctl); return -ENOMEM; } - ret = set_folio_extent_mapped(page_folio(page)); + ret = set_folio_extent_mapped(folio); if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); io_ctl_drop_pages(io_ctl); return ret; } - io_ctl->pages[i] = page; - if (uptodate && !PageUptodate(page)) { - btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (page->mapping != inode->i_mapping) { + io_ctl->pages[i] = &folio->page; + if (uptodate && !folio_test_uptodate(folio)) { + btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (folio->mapping != inode->i_mapping) { btrfs_err(BTRFS_I(inode)->root->fs_info, "free space cache page truncated"); io_ctl_drop_pages(io_ctl); return -EIO; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { btrfs_err(BTRFS_I(inode)->root->fs_info, "error reading free space cache"); io_ctl_drop_pages(io_ctl); @@ -753,8 +752,8 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, return 0; key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -1156,8 +1155,8 @@ update_cache_item(struct btrfs_trans_handle *trans, int ret; key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index cae540ec15ed..39c6b96a4c25 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1062,7 +1062,8 @@ static int populate_free_space_tree(struct 
btrfs_trans_handle *trans, struct btrfs_block_group *block_group) { struct btrfs_root *extent_root; - struct btrfs_path *path, *path2; + BTRFS_PATH_AUTO_FREE(path); + BTRFS_PATH_AUTO_FREE(path2); struct btrfs_key key; u64 start, end; int ret; @@ -1070,17 +1071,16 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = READA_FORWARD; path2 = btrfs_alloc_path(); - if (!path2) { - btrfs_free_path(path); + if (!path2) return -ENOMEM; - } + + path->reada = READA_FORWARD; ret = add_new_free_space_info(trans, block_group, path2); if (ret) - goto out; + return ret; mutex_lock(&block_group->free_space_lock); @@ -1146,9 +1146,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, ret = 0; out_locked: mutex_unlock(&block_group->free_space_lock); -out: - btrfs_free_path(path2); - btrfs_free_path(path); + return ret; } @@ -1217,7 +1215,7 @@ out_clear: static int clear_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int nr; int ret; @@ -1233,7 +1231,7 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans, while (1) { ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) - goto out; + return ret; nr = btrfs_header_nritems(path->nodes[0]); if (!nr) @@ -1242,15 +1240,12 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans, path->slots[0] = 0; ret = btrfs_del_items(trans, root, path, 0, nr); if (ret) - goto out; + return ret; btrfs_release_path(path); } - ret = 0; -out: - btrfs_free_path(path); - return ret; + return 0; } int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) @@ -1638,9 +1633,8 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group *block_group; struct btrfs_free_space_info *info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); u32 extent_count, flags; - int ret; block_group = caching_ctl->block_group; @@ -1657,10 +1651,9 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl) path->reada = READA_FORWARD; info = search_free_space_info(NULL, block_group, path, 0); - if (IS_ERR(info)) { - ret = PTR_ERR(info); - goto out; - } + if (IS_ERR(info)) + return PTR_ERR(info); + extent_count = btrfs_free_space_extent_count(path->nodes[0], info); flags = btrfs_free_space_flags(path->nodes[0], info); @@ -1670,11 +1663,7 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl) * there. */ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) - ret = load_free_space_bitmaps(caching_ctl, path, extent_count); + return load_free_space_bitmaps(caching_ctl, path, extent_count); else - ret = load_free_space_extents(caching_ctl, path, extent_count); - -out: - btrfs_free_path(path); - return ret; + return load_free_space_extents(caching_ctl, path, extent_count); } diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index 09cfb43580cb..b2bb86f8d7cf 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "messages.h" -#include "ctree.h" #include "fs.h" #include "accessors.h" #include "volumes.h" diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index b572d6b9730b..bcca43046064 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -47,6 +47,18 @@ struct btrfs_subpage_info; struct btrfs_stripe_hash_table; struct btrfs_space_info; +/* + * Minimum data and metadata block size. 
+ * + * Normally it's 4K, but for testing subpage block size on 4K page systems, we + * allow DEBUG builds to accept 2K block size. + */ +#ifdef CONFIG_BTRFS_DEBUG +#define BTRFS_MIN_BLOCKSIZE (SZ_2K) +#else +#define BTRFS_MIN_BLOCKSIZE (SZ_4K) +#endif + #define BTRFS_MAX_EXTENT_SIZE SZ_128M #define BTRFS_OLDEST_GENERATION 0ULL @@ -105,6 +117,9 @@ enum { /* Indicates there was an error cleaning up a log tree. */ BTRFS_FS_STATE_LOG_CLEANUP_ERROR, + /* No more delayed iput can be queued. */ + BTRFS_FS_STATE_NO_DELAYED_IPUT, + BTRFS_FS_STATE_COUNT }; @@ -485,8 +500,8 @@ struct btrfs_fs_info { u64 last_trans_log_full_commit; unsigned long long mount_opt; - unsigned long compress_type:4; - unsigned int compress_level; + int compress_type; + int compress_level; u32 commit_interval; /* * It is a suggestive number, the read side is safe even if it gets a @@ -709,7 +724,6 @@ struct btrfs_fs_info { * running. */ refcount_t scrub_workers_refcnt; - u32 sectors_per_page; struct workqueue_struct *scrub_workers; struct btrfs_discard_ctl discard_ctl; @@ -981,6 +995,12 @@ static inline u32 count_max_extents(const struct btrfs_fs_info *fs_info, u64 siz return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size); } +static inline unsigned int btrfs_blocks_per_folio(const struct btrfs_fs_info *fs_info, + const struct folio *folio) +{ + return folio_size(folio) >> fs_info->sectorsize_bits; +} + bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type); bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 448aa1a682d6..3530de0618c8 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -191,8 +191,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, int del_len = name->len + sizeof(*ref); key.objectid = inode_objectid; - key.offset = ref_objectid; key.type = BTRFS_INODE_REF_KEY; + key.offset = ref_objectid; path = btrfs_alloc_path(); if (!path) @@ -317,8 +317,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, int ins_len = name->len + sizeof(*ref); key.objectid = inode_objectid; - key.offset = ref_objectid; key.type = BTRFS_INODE_REF_KEY; + key.offset = ref_objectid; path = btrfs_alloc_path(); if (!path) @@ -493,8 +493,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, path->reada = READA_BACK; key.objectid = control->ino; - key.offset = (u64)-1; key.type = (u8)-1; + key.offset = (u64)-1; search_again: /* diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a9e56c994e9e..cc67d1a2d611 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -489,8 +489,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, size_t datasize; key.objectid = btrfs_ino(inode); - key.offset = 0; key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; datasize = btrfs_file_extent_calc_inline_size(cur_size); ret = btrfs_insert_empty_item(trans, root, path, &key, @@ -566,23 +566,14 @@ static bool can_cow_file_range_inline(struct btrfs_inode *inode, if (offset != 0) return false; - /* - * Due to the page size limit, for subpage we can only trigger the - * writeback for the dirty sectors of page, that means data writeback - * is doing more writeback than what we want. - * - * This is especially unexpected for some call sites like fallocate, - * where we only increase i_size after everything is done. - * This means we can trigger inline extent even if we didn't want to. - * So here we skip inline extent creation completely.
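An aside on the can_cow_file_range_inline() hunk that continues below: the removed page-size guard is superseded by block-size based gating. A rough user-space sketch of the resulting predicate follows; the constants are illustrative stand-ins for fs_info->sectorsize and BTRFS_MAX_INLINE_DATA_SIZE(), not the real kernel macros.

#include <stdbool.h>
#include <stdint.h>

/* Illustrative stand-ins, not the real kernel values. */
#define BLOCKSIZE		4096u	/* fs_info->sectorsize */
#define MAX_INLINE_DATA_SIZE	2048u	/* BTRFS_MAX_INLINE_DATA_SIZE() */

/* Mirrors the post-patch checks: inline extents must start at offset 0,
 * stay within one block, keep the uncompressed length strictly below
 * the block size, and still fit into a leaf item. */
static bool can_inline_extent(uint64_t offset, uint64_t size, uint64_t data_len)
{
	if (offset != 0)
		return false;
	if (size > BLOCKSIZE)
		return false;
	if (data_len >= BLOCKSIZE)
		return false;
	if (data_len > MAX_INLINE_DATA_SIZE)
		return false;
	return true;
}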
- */ - if (fs_info->sectorsize != PAGE_SIZE) - return false; - /* Inline extents are limited to sectorsize. */ if (size > fs_info->sectorsize) return false; + /* We do not allow a non-compressed extent to be as large as block size. */ + if (data_len >= fs_info->sectorsize) + return false; + /* We cannot exceed the maximum inline data size. */ if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) return false; @@ -672,7 +663,7 @@ out: * And at reserve time, it's always aligned to page size, so * just free one page here. */ - btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL); + btrfs_qgroup_free_data(inode, NULL, 0, fs_info->sectorsize, NULL); btrfs_free_path(path); btrfs_end_transaction(trans); return ret; @@ -832,7 +823,7 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, btrfs_add_inode_defrag(inode, small_write); } -static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, u64 end) { unsigned long end_index = end >> PAGE_SHIFT; struct folio *folio; @@ -840,13 +831,13 @@ static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 e for (unsigned long index = start >> PAGE_SHIFT; index <= end_index; index++) { - folio = filemap_get_folio(inode->i_mapping, index); + folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); if (IS_ERR(folio)) { if (!ret) ret = PTR_ERR(folio); continue; } - btrfs_folio_clamp_clear_dirty(inode_to_fs_info(inode), folio, start, + btrfs_folio_clamp_clear_dirty(inode->root->fs_info, folio, start, end + 1 - start); folio_put(folio); } @@ -886,6 +877,7 @@ static void compress_file_range(struct btrfs_work *work) unsigned int poff; int i; int compress_type = fs_info->compress_type; + int compress_level = fs_info->compress_level; inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); @@ -894,7 +886,7 @@ static void compress_file_range(struct btrfs_work *work) * Otherwise applications with the file mmap'd can wander in and change * the page contents while we are compressing them. */ - ret = extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); + ret = extent_range_clear_dirty_for_io(inode, start, end); /* * All the folios should have been locked thus no failure. @@ -968,13 +960,15 @@ again: goto cleanup_and_bail_uncompressed; } - if (inode->defrag_compress) + if (inode->defrag_compress) { compress_type = inode->defrag_compress; - else if (inode->prop_compress) + compress_level = inode->defrag_compress_level; + } else if (inode->prop_compress) { compress_type = inode->prop_compress; + } /* Compression level is applied here. */ - ret = btrfs_compress_folios(compress_type | (fs_info->compress_level << 4), + ret = btrfs_compress_folios(compress_type, compress_level, mapping, start, folios, &nr_folios, &total_in, &total_compressed); if (ret) @@ -1090,7 +1084,6 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, &wbc, false); wbc_detach_inode(&wbc); if (ret < 0) { - btrfs_cleanup_ordered_extents(inode, start, end - start + 1); if (locked_folio) btrfs_folio_end_lock(inode->root->fs_info, locked_folio, start, async_extent->ram_size); @@ -1272,10 +1265,7 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * - Else all pages except for @locked_folio are unlocked. * * When a failure happens in the second or later iteration of the - * while-loop, the ordered extents created in previous iterations are kept - * intact. 
So, the caller must clean them up by calling - btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for - example. + while-loop, the ordered extents created in previous iterations are cleaned up. */ static noinline int cow_file_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, @@ -1492,11 +1482,9 @@ out_unlock: /* * For the range (1). We have already instantiated the ordered extents - * for this region. They are cleaned up by - * btrfs_cleanup_ordered_extents() in e.g, - * btrfs_run_delalloc_range(). + * for this region, thus we need to clean up those ordered extents. * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV - * are also handled by the cleanup function. + * are also handled by the ordered extents cleanup. * * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and * finish the writeback of the involved folios, which will never be submitted. @@ -1507,6 +1495,8 @@ out_unlock: if (!locked_folio) mapping_set_error(inode->vfs_inode.i_mapping, ret); + + btrfs_cleanup_ordered_extents(inode, orig_start, start - orig_start); extent_clear_unlock_delalloc(inode, orig_start, start - 1, locked_folio, NULL, clear_bits, page_ops); } @@ -1976,6 +1966,65 @@ static void cleanup_dirty_folios(struct btrfs_inode *inode, mapping_set_error(mapping, error); } +static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio, + struct extent_state **cached, + struct can_nocow_file_extent_args *nocow_args, + u64 file_pos, bool is_prealloc) +{ + struct btrfs_ordered_extent *ordered; + u64 len = nocow_args->file_extent.num_bytes; + u64 end = file_pos + len - 1; + int ret = 0; + + lock_extent(&inode->io_tree, file_pos, end, cached); + + if (is_prealloc) { + struct extent_map *em; + + em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent, + BTRFS_ORDERED_PREALLOC); + if (IS_ERR(em)) { + unlock_extent(&inode->io_tree, file_pos, end, cached); + return PTR_ERR(em); + } + free_extent_map(em); + } + + ordered = btrfs_alloc_ordered_extent(inode, file_pos, &nocow_args->file_extent, + is_prealloc + ? (1 << BTRFS_ORDERED_PREALLOC) + : (1 << BTRFS_ORDERED_NOCOW)); + if (IS_ERR(ordered)) { + if (is_prealloc) + btrfs_drop_extent_map_range(inode, file_pos, end, false); + unlock_extent(&inode->io_tree, file_pos, end, cached); + return PTR_ERR(ordered); + } + + if (btrfs_is_data_reloc_root(inode->root)) + /* + * Errors are handled later, as we must prevent + * extent_clear_unlock_delalloc() in error handler from freeing + * metadata of the created ordered extent. + */ + ret = btrfs_reloc_clone_csums(ordered); + btrfs_put_ordered_extent(ordered); + + extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_CLEAR_DATA_RESV, + PAGE_UNLOCK | PAGE_SET_ORDERED); + /* + * On error, we need to clean up the ordered extents we created. + * + * We do not clear the folio Dirty flags because they are set and + * cleared by the caller. + */ + if (ret < 0) + btrfs_cleanup_ordered_extents(inode, file_pos, end); + return ret; +} + /* * Called for nocow writeback. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required.
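The new nocow_one_range() above folds the duplicated prealloc and plain-nocow paths into one helper whose failure handling unwinds exactly the steps that have already succeeded, in reverse order. A minimal self-contained sketch of that shape, with hypothetical stub helpers standing in for the extent-lock, extent-map and ordered-extent calls:

#include <errno.h>
#include <stdbool.h>

/* Hypothetical stubs standing in for the real kernel operations. */
static void lock_range(void)		{ }
static void unlock_range(void)		{ }
static int  create_io_em(void)		{ return 0; }	/* prealloc only */
static void drop_io_em(void)		{ }
static int  alloc_ordered(bool prealloc) { (void)prealloc; return 0; }

static int nocow_one(bool is_prealloc)
{
	lock_range();					/* step 1 */

	if (is_prealloc && create_io_em() < 0) {	/* step 2 */
		unlock_range();				/* undo step 1 */
		return -ENOMEM;
	}
	if (alloc_ordered(is_prealloc) < 0) {		/* step 3 */
		if (is_prealloc)
			drop_io_em();			/* undo step 2 */
		unlock_range();				/* undo step 1 */
		return -ENOMEM;
	}
	/* Success: the ordered extent now owns cleanup of the range. */
	unlock_range();
	return 0;
}

Keeping each undo step next to the step it reverts is what lets the run_delalloc_nocow() hunk below drop its open-coded prealloc and nocow error paths.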
@@ -2020,15 +2069,12 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, while (cur_offset <= end) { struct btrfs_block_group *nocow_bg = NULL; - struct btrfs_ordered_extent *ordered; struct btrfs_key found_key; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; struct extent_state *cached_state = NULL; u64 extent_end; - u64 nocow_end; int extent_type; - bool is_prealloc; ret = btrfs_lookup_file_extent(NULL, root, path, ino, cur_offset, 0); @@ -2154,75 +2200,21 @@ must_cow: if (cow_start != (u64)-1) { ret = fallback_to_cow(inode, locked_folio, cow_start, found_key.offset - 1); - cow_start = (u64)-1; if (ret) { cow_end = found_key.offset - 1; btrfs_dec_nocow_writers(nocow_bg); goto error; } + cow_start = (u64)-1; } - nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1; - lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state); - - is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC; - if (is_prealloc) { - struct extent_map *em; - - em = btrfs_create_io_em(inode, cur_offset, - &nocow_args.file_extent, - BTRFS_ORDERED_PREALLOC); - if (IS_ERR(em)) { - unlock_extent(&inode->io_tree, cur_offset, - nocow_end, &cached_state); - btrfs_dec_nocow_writers(nocow_bg); - ret = PTR_ERR(em); - goto error; - } - free_extent_map(em); - } - - ordered = btrfs_alloc_ordered_extent(inode, cur_offset, - &nocow_args.file_extent, - is_prealloc - ? (1 << BTRFS_ORDERED_PREALLOC) - : (1 << BTRFS_ORDERED_NOCOW)); + ret = nocow_one_range(inode, locked_folio, &cached_state, + &nocow_args, cur_offset, + extent_type == BTRFS_FILE_EXTENT_PREALLOC); btrfs_dec_nocow_writers(nocow_bg); - if (IS_ERR(ordered)) { - if (is_prealloc) { - btrfs_drop_extent_map_range(inode, cur_offset, - nocow_end, false); - } - unlock_extent(&inode->io_tree, cur_offset, - nocow_end, &cached_state); - ret = PTR_ERR(ordered); + if (ret < 0) goto error; - } - - if (btrfs_is_data_reloc_root(root)) - /* - * Error handled later, as we must prevent - * extent_clear_unlock_delalloc() in error handler - * from freeing metadata of created ordered extent. - */ - ret = btrfs_reloc_clone_csums(ordered); - btrfs_put_ordered_extent(ordered); - - extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, - locked_folio, &cached_state, - EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_CLEAR_DATA_RESV, - PAGE_UNLOCK | PAGE_SET_ORDERED); - cur_offset = extent_end; - - /* - * btrfs_reloc_clone_csums() error, now we're OK to call error - * handler, as metadata for created ordered extent will only - * be freed by btrfs_finish_ordered_io(). - */ - if (ret) - goto error; } btrfs_release_path(path); @@ -2231,11 +2223,11 @@ must_cow: if (cow_start != (u64)-1) { ret = fallback_to_cow(inode, locked_folio, cow_start, end); - cow_start = (u64)-1; if (ret) { cow_end = end; goto error; } + cow_start = (u64)-1; } btrfs_free_path(path); @@ -2249,27 +2241,44 @@ error: * start cur_offset end * |/////////////| | * + * In this case, cow_start should be (u64)-1. + * * For range [start, cur_offset) the folios are already unlocked (except * @locked_folio), EXTENT_DELALLOC already removed. - * Only need to clear the dirty flag as they will never be submitted. - * Ordered extent and extent maps are handled by - * btrfs_mark_ordered_io_finished() inside run_delalloc_range(). + * Need to clear the dirty flags and finish the ordered extents. 
+ * + * 2) Failed with error before calling fallback_to_cow() + * + * start cow_start end + * |/////////////| | + * + * In this case, only @cow_start is set; @cur_offset is within + * [cow_start, end). + * + * It's mostly the same as case 1), just replace @cur_offset with + * @cow_start. * - * 2) Failed with error from fallback_to_cow() - * start cur_offset cow_end end + * 3) Failed with error from fallback_to_cow() + * + * start cow_start cow_end end * |/////////////|-----------| | * - * For range [start, cur_offset) it's the same as case 1). - * But for range [cur_offset, cow_end), the folios have dirty flag - * cleared and unlocked, EXTENT_DEALLLOC cleared by cow_file_range(). + * In this case, both @cow_start and @cow_end are set. * - * Thus we should not call extent_clear_unlock_delalloc() on range - * [cur_offset, cow_end), as the folios are already unlocked. + * For range [start, cow_start) it's the same as case 1). + * But for range [cow_start, cow_end), all the cleanup is handled by + * cow_file_range(); we should not touch anything in that range. * - * So clear the folio dirty flags for [start, cur_offset) first. + * So in all the above cases, if @cow_start is set, clean up ordered extents + * for range [start, @cow_start), otherwise clean up range [start, @cur_offset). */ - if (cur_offset > start) + if (cow_start != (u64)-1) + cur_offset = cow_start; + + if (cur_offset > start) { + btrfs_cleanup_ordered_extents(inode, start, cur_offset - start); cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret); + } /* * If an error happened while a COW region is outstanding, cur_offset @@ -2334,7 +2343,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol if (should_nocow(inode, start, end)) { ret = run_delalloc_nocow(inode, locked_folio, start, end); - goto out; + return ret; } if (btrfs_inode_can_compress(inode) && @@ -2348,10 +2357,6 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol else ret = cow_file_range(inode, locked_folio, start, end, NULL, false, false); - -out: - if (ret < 0) - btrfs_cleanup_ordered_extents(inode, start, end - start + 1); return ret; } @@ -2878,6 +2883,21 @@ int btrfs_writepage_cow_fixup(struct folio *folio) return 0; /* + * For experimental builds, we error out instead of returning -EAGAIN. + * + * We should not hit such out-of-band dirty folios anymore. + */ + if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + btrfs_err_rl(fs_info, + "root %lld ino %llu folio %llu is marked dirty without notifying the fs", + BTRFS_I(inode)->root->root_key.objectid, + btrfs_ino(BTRFS_I(inode)), + folio_pos(folio)); + return -EUCLEAN; + } + + /* * folio_checked is set below when we create a fixup worker for this * folio, don't try to create another one if we're already * folio_test_checked. @@ -2896,7 +2916,7 @@ int btrfs_writepage_cow_fixup(struct folio *folio) * We are already holding a reference to this inode from * write_cache_pages. We need to hold it because the space reservation * takes place outside of the folio lock, and we can't trust - * page->mapping outside of the folio lock. + * folio->mapping outside of the folio lock.
*/ ihold(inode); btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); @@ -2952,8 +2972,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, if (!drop_args.extent_inserted) { ins.objectid = btrfs_ino(inode); - ins.offset = file_pos; ins.type = BTRFS_EXTENT_DATA_KEY; + ins.offset = file_pos; ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*stack_fi)); @@ -2988,8 +3008,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found); ins.objectid = disk_bytenr; - ins.offset = disk_num_bytes; ins.type = BTRFS_EXTENT_ITEM_KEY; + ins.offset = disk_num_bytes; ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes); if (ret) @@ -3407,6 +3427,7 @@ void btrfs_add_delayed_iput(struct btrfs_inode *inode) if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1)) return; + WARN_ON_ONCE(test_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state)); atomic_inc(&fs_info->nr_delayed_iputs); /* * Need to be irq safe here because we can be called from either an irq @@ -3527,7 +3548,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) struct extent_buffer *leaf; struct btrfs_key key, found_key; struct btrfs_trans_handle *trans; - struct inode *inode; u64 last_objectid = 0; int ret = 0, nr_unlink = 0; @@ -3546,6 +3566,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) key.offset = (u64)-1; while (1) { + struct btrfs_inode *inode; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; @@ -3669,10 +3691,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * deleted but wasn't. The inode number may have been reused, * but either way, we can delete the orphan item. */ - if (!inode || inode->i_nlink) { + if (!inode || inode->vfs_inode.i_nlink) { if (inode) { - ret = btrfs_drop_verity_items(BTRFS_I(inode)); - iput(inode); + ret = btrfs_drop_verity_items(inode); + iput(&inode->vfs_inode); inode = NULL; if (ret) goto out; @@ -3695,7 +3717,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) nr_unlink++; /* this will do delete_inode and everything for us */ - iput(inode); + iput(&inode->vfs_inode); } /* release the path since we're done with it */ btrfs_release_path(path); @@ -3845,12 +3867,13 @@ static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) * * On failure clean up the inode. 
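The btrfs_read_locked_inode() conversion below takes a struct btrfs_inode directly and reaches the generic fields through its embedded vfs_inode; the reverse lookup is what BTRFS_I() does. A trimmed user-space model of that embedding, under the assumption that the real BTRFS_I() is a plain container_of():

#include <stddef.h>

struct inode {
	unsigned long i_state;			/* trimmed VFS inode */
};

struct btrfs_inode {
	unsigned long runtime_flags;		/* btrfs-private state */
	struct inode vfs_inode;			/* embedded VFS inode */
};

/* Recover the containing btrfs_inode from its embedded vfs_inode. */
static struct btrfs_inode *BTRFS_I(struct inode *inode)
{
	return (struct btrfs_inode *)((char *)inode -
			offsetof(struct btrfs_inode, vfs_inode));
}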
*/ -static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) +static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path *path) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct inode *vfs_inode = &inode->vfs_inode; struct btrfs_key location; unsigned long ptr; int maybe_acls; @@ -3859,7 +3882,7 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) bool filled = false; int first_xattr_slot; - ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); + ret = btrfs_init_file_extent_tree(inode); if (ret) goto out; @@ -3869,7 +3892,7 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) ASSERT(path); - btrfs_get_inode_key(BTRFS_I(inode), &location); + btrfs_get_inode_key(inode, &location); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) { @@ -3889,41 +3912,41 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); - inode->i_mode = btrfs_inode_mode(leaf, inode_item); - set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); - i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); - i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); - btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item)); - btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, - round_up(i_size_read(inode), fs_info->sectorsize)); - - inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime), + vfs_inode->i_mode = btrfs_inode_mode(leaf, inode_item); + set_nlink(vfs_inode, btrfs_inode_nlink(leaf, inode_item)); + i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item)); + i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item)); + btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); + btrfs_inode_set_file_extent_range(inode, 0, + round_up(i_size_read(vfs_inode), fs_info->sectorsize)); + + inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime), btrfs_timespec_nsec(leaf, &inode_item->atime)); - inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime), + inode_set_mtime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->mtime), btrfs_timespec_nsec(leaf, &inode_item->mtime)); - inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime), + inode_set_ctime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->ctime), btrfs_timespec_nsec(leaf, &inode_item->ctime)); - BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime); - BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime); + inode->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime); + inode->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime); - inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); - BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); - BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); + inode_set_bytes(vfs_inode, btrfs_inode_nbytes(leaf, inode_item)); + inode->generation = btrfs_inode_generation(leaf, inode_item); + inode->last_trans = btrfs_inode_transid(leaf, inode_item); - inode_set_iversion_queried(inode, - btrfs_inode_sequence(leaf, inode_item)); - inode->i_generation = BTRFS_I(inode)->generation; - inode->i_rdev = 0; + 
inode_set_iversion_queried(vfs_inode, btrfs_inode_sequence(leaf, inode_item)); + vfs_inode->i_generation = inode->generation; + vfs_inode->i_rdev = 0; rdev = btrfs_inode_rdev(leaf, inode_item); - if (S_ISDIR(inode->i_mode)) - BTRFS_I(inode)->index_cnt = (u64)-1; + if (S_ISDIR(vfs_inode->i_mode)) + inode->index_cnt = (u64)-1; btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), - &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); + &inode->flags, &inode->ro_flags); + btrfs_update_inode_mapping_flags(inode); cache_index: /* @@ -3935,9 +3958,8 @@ cache_index: * This is required for both inode re-read from disk and delayed inode * in the delayed_nodes xarray. */ - if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info)) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + if (inode->last_trans == btrfs_get_fs_generation(fs_info)) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); /* * We don't persist the id of the transaction where an unlink operation @@ -3966,7 +3988,7 @@ cache_index: * transaction commits on fsync if our inode is a directory, or if our * inode is not a directory, logging its parent unnecessarily. */ - BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; + inode->last_unlink_trans = inode->last_trans; /* * Same logic as for last_unlink_trans. We don't persist the generation @@ -3974,15 +3996,15 @@ cache_index: * operation, so after eviction and reloading the inode we must be * pessimistic and assume the last transaction that modified the inode. */ - BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans; + inode->last_reflink_trans = inode->last_trans; path->slots[0]++; - if (inode->i_nlink != 1 || + if (vfs_inode->i_nlink != 1 || path->slots[0] >= btrfs_header_nritems(leaf)) goto cache_acl; btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); - if (location.objectid != btrfs_ino(BTRFS_I(inode))) + if (location.objectid != btrfs_ino(inode)) goto cache_acl; ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); @@ -3990,13 +4012,12 @@ cache_index: struct btrfs_inode_ref *ref; ref = (struct btrfs_inode_ref *)ptr; - BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); + inode->dir_index = btrfs_inode_ref_index(leaf, ref); } else if (location.type == BTRFS_INODE_EXTREF_KEY) { struct btrfs_inode_extref *extref; extref = (struct btrfs_inode_extref *)ptr; - BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, - extref); + inode->dir_index = btrfs_inode_extref_index(leaf, extref); } cache_acl: /* @@ -4004,50 +4025,49 @@ cache_acl: * any xattrs or acls */ maybe_acls = acls_after_inode_item(leaf, path->slots[0], - btrfs_ino(BTRFS_I(inode)), &first_xattr_slot); + btrfs_ino(inode), &first_xattr_slot); if (first_xattr_slot != -1) { path->slots[0] = first_xattr_slot; ret = btrfs_load_inode_props(inode, path); if (ret) btrfs_err(fs_info, "error loading props for ino %llu (root %llu): %d", - btrfs_ino(BTRFS_I(inode)), - btrfs_root_id(root), ret); + btrfs_ino(inode), btrfs_root_id(root), ret); } if (!maybe_acls) - cache_no_acl(inode); + cache_no_acl(vfs_inode); - switch (inode->i_mode & S_IFMT) { + switch (vfs_inode->i_mode & S_IFMT) { case S_IFREG: - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_fop = &btrfs_file_operations; - inode->i_op = &btrfs_file_inode_operations; + vfs_inode->i_mapping->a_ops = &btrfs_aops; + vfs_inode->i_fop = &btrfs_file_operations; + vfs_inode->i_op = &btrfs_file_inode_operations; break; case S_IFDIR: - inode->i_fop = &btrfs_dir_file_operations; - inode->i_op = 
&btrfs_dir_inode_operations; + vfs_inode->i_fop = &btrfs_dir_file_operations; + vfs_inode->i_op = &btrfs_dir_inode_operations; break; case S_IFLNK: - inode->i_op = &btrfs_symlink_inode_operations; - inode_nohighmem(inode); - inode->i_mapping->a_ops = &btrfs_aops; + vfs_inode->i_op = &btrfs_symlink_inode_operations; + inode_nohighmem(vfs_inode); + vfs_inode->i_mapping->a_ops = &btrfs_aops; break; default: - inode->i_op = &btrfs_special_inode_operations; - init_special_inode(inode, inode->i_mode, rdev); + vfs_inode->i_op = &btrfs_special_inode_operations; + init_special_inode(vfs_inode, vfs_inode->i_mode, rdev); break; } btrfs_sync_inode_flags_to_i_flags(inode); - ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); + ret = btrfs_add_inode_to_root(inode, true); if (ret) goto out; return 0; out: - iget_failed(inode); + iget_failed(vfs_inode); return ret; } @@ -5602,7 +5622,7 @@ static int btrfs_find_actor(struct inode *inode, void *opaque) args->root == BTRFS_I(inode)->root; } -static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) +static struct btrfs_inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; @@ -5614,40 +5634,42 @@ static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor, btrfs_init_locked_inode, (void *)&args); - return inode; + if (!inode) + return NULL; + return BTRFS_I(inode); } /* * Get an inode object given its inode number and corresponding root. Path is * preallocated to prevent recursing back to iget through allocator. */ -struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, - struct btrfs_path *path) +struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, + struct btrfs_path *path) { - struct inode *inode; + struct btrfs_inode *inode; int ret; inode = btrfs_iget_locked(ino, root); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode->vfs_inode.i_state & I_NEW)) return inode; ret = btrfs_read_locked_inode(inode, path); if (ret) return ERR_PTR(ret); - unlock_new_inode(inode); + unlock_new_inode(&inode->vfs_inode); return inode; } /* * Get an inode object given its inode number and corresponding root. 
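The btrfs_iget() family below now returns struct btrfs_inode pointers, so errors crossing the pointer-type boundary go through ERR_CAST(). For reference, a user-space model of the ERR_PTR encoding these helpers rely on; errnos ride in the top, never-mappable pointer values:

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error)
{
	return (void *)error;
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* ERR_CAST() re-types an error pointer without changing its value. */
static inline void *ERR_CAST(const void *ptr)
{
	return (void *)ptr;
}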
*/ -struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) +struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root) { - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_path *path; int ret; @@ -5655,7 +5677,7 @@ struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode->vfs_inode.i_state & I_NEW)) return inode; path = btrfs_alloc_path(); @@ -5667,43 +5689,46 @@ struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) if (ret) return ERR_PTR(ret); - unlock_new_inode(inode); + unlock_new_inode(&inode->vfs_inode); return inode; } -static struct inode *new_simple_dir(struct inode *dir, - struct btrfs_key *key, - struct btrfs_root *root) +static struct btrfs_inode *new_simple_dir(struct inode *dir, + struct btrfs_key *key, + struct btrfs_root *root) { struct timespec64 ts; - struct inode *inode = new_inode(dir->i_sb); + struct inode *vfs_inode; + struct btrfs_inode *inode; - if (!inode) + vfs_inode = new_inode(dir->i_sb); + if (!vfs_inode) return ERR_PTR(-ENOMEM); - BTRFS_I(inode)->root = btrfs_grab_root(root); - BTRFS_I(inode)->ref_root_id = key->objectid; - set_bit(BTRFS_INODE_ROOT_STUB, &BTRFS_I(inode)->runtime_flags); - set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); + inode = BTRFS_I(vfs_inode); + inode->root = btrfs_grab_root(root); + inode->ref_root_id = key->objectid; + set_bit(BTRFS_INODE_ROOT_STUB, &inode->runtime_flags); + set_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags); - btrfs_set_inode_number(BTRFS_I(inode), BTRFS_EMPTY_SUBVOL_DIR_OBJECTID); + btrfs_set_inode_number(inode, BTRFS_EMPTY_SUBVOL_DIR_OBJECTID); /* * We only need lookup, the rest is read-only and there's no inode * associated with the dentry */ - inode->i_op = &simple_dir_inode_operations; - inode->i_opflags &= ~IOP_XATTR; - inode->i_fop = &simple_dir_operations; - inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; + vfs_inode->i_op = &simple_dir_inode_operations; + vfs_inode->i_opflags &= ~IOP_XATTR; + vfs_inode->i_fop = &simple_dir_operations; + vfs_inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; - ts = inode_set_ctime_current(inode); - inode_set_mtime_to_ts(inode, ts); - inode_set_atime_to_ts(inode, inode_get_atime(dir)); - BTRFS_I(inode)->i_otime_sec = ts.tv_sec; - BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec; + ts = inode_set_ctime_current(vfs_inode); + inode_set_mtime_to_ts(vfs_inode, ts); + inode_set_atime_to_ts(vfs_inode, inode_get_atime(dir)); + inode->i_otime_sec = ts.tv_sec; + inode->i_otime_nsec = ts.tv_nsec; - inode->i_uid = dir->i_uid; - inode->i_gid = dir->i_gid; + vfs_inode->i_uid = dir->i_uid; + vfs_inode->i_gid = dir->i_gid; return inode; } @@ -5717,15 +5742,15 @@ static_assert(BTRFS_FT_FIFO == FT_FIFO); static_assert(BTRFS_FT_SOCK == FT_SOCK); static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK); -static inline u8 btrfs_inode_type(struct inode *inode) +static inline u8 btrfs_inode_type(const struct btrfs_inode *inode) { - return fs_umode_to_ftype(inode->i_mode); + return fs_umode_to_ftype(inode->vfs_inode.i_mode); } struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) { struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *sub_root = root; struct btrfs_key location = { 0 }; @@ -5742,18 +5767,18 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (location.type == BTRFS_INODE_ITEM_KEY) { inode = 
btrfs_iget(location.objectid, root); if (IS_ERR(inode)) - return inode; + return ERR_CAST(inode); /* Do extra check against inode mode with di_type */ if (btrfs_inode_type(inode) != di_type) { btrfs_crit(fs_info, "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u", - inode->i_mode, btrfs_inode_type(inode), + inode->vfs_inode.i_mode, btrfs_inode_type(inode), di_type); - iput(inode); + iput(&inode->vfs_inode); return ERR_PTR(-EUCLEAN); } - return inode; + return &inode->vfs_inode; } ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry, @@ -5768,19 +5793,22 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) btrfs_put_root(sub_root); if (IS_ERR(inode)) - return inode; + return ERR_CAST(inode); down_read(&fs_info->cleanup_work_sem); - if (!sb_rdonly(inode->i_sb)) + if (!sb_rdonly(inode->vfs_inode.i_sb)) ret = btrfs_orphan_cleanup(sub_root); up_read(&fs_info->cleanup_work_sem); if (ret) { - iput(inode); + iput(&inode->vfs_inode); inode = ERR_PTR(ret); } } - return inode; + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return &inode->vfs_inode; } static int btrfs_dentry_delete(const struct dentry *dentry) @@ -6253,7 +6281,7 @@ static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode * inode->flags |= BTRFS_INODE_NODATASUM; } - btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + btrfs_sync_inode_flags_to_i_flags(inode); } int btrfs_create_new_inode(struct btrfs_trans_handle *trans, @@ -6339,6 +6367,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (btrfs_test_opt(fs_info, NODATACOW)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM; + btrfs_update_inode_mapping_flags(BTRFS_I(inode)); } ret = btrfs_insert_inode_locked(inode); @@ -6432,7 +6461,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, path = NULL; if (args->subvol) { - struct inode *parent; + struct btrfs_inode *parent; /* * Subvolumes inherit properties from their parent subvolume, @@ -6442,11 +6471,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (IS_ERR(parent)) { ret = PTR_ERR(parent); } else { - ret = btrfs_inode_inherit_props(trans, inode, parent); - iput(parent); + ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode), + parent); + iput(&parent->vfs_inode); } } else { - ret = btrfs_inode_inherit_props(trans, inode, dir); + ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode), + BTRFS_I(dir)); } if (ret) { btrfs_err(fs_info, @@ -6544,7 +6575,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, return ret; ret = btrfs_insert_dir_item(trans, name, parent_inode, &key, - btrfs_inode_type(&inode->vfs_inode), index); + btrfs_inode_type(inode), index); if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; else if (ret) { @@ -6764,6 +6795,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, { int ret; struct extent_buffer *leaf = path->nodes[0]; + const u32 blocksize = leaf->fs_info->sectorsize; char *tmp; size_t max_size; unsigned long inline_size; @@ -6780,7 +6812,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); - max_size = min_t(unsigned long, PAGE_SIZE, max_size); + max_size = min_t(unsigned long, blocksize, max_size); ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size, max_size); @@ -6792,14 +6824,15 @@ static noinline int uncompress_inline(struct btrfs_path *path, * cover that region here. 
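In the uncompress_inline() and read_inline_extent() hunks around here, PAGE_SIZE gives way to the filesystem block size, both for the copy limit and for zero-filling the remainder of the block. The copy-then-zero step in isolation, as a plain sketch:

#include <string.h>

/* Copy an inline extent's bytes into a block-sized buffer and zero
 * whatever the inline data does not cover, as the kernel code does
 * with folio_zero_range(). */
static void fill_block(char *dst, const char *src,
		       size_t copy_size, size_t blocksize)
{
	memcpy(dst, src, copy_size);
	if (copy_size < blocksize)
		memset(dst + copy_size, 0, blocksize - copy_size);
}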
*/ - if (max_size < PAGE_SIZE) - folio_zero_range(folio, max_size, PAGE_SIZE - max_size); + if (max_size < blocksize) + folio_zero_range(folio, max_size, blocksize - max_size); kfree(tmp); return ret; } static int read_inline_extent(struct btrfs_path *path, struct folio *folio) { + const u32 blocksize = path->nodes[0]->fs_info->sectorsize; struct btrfs_file_extent_item *fi; void *kaddr; size_t copy_size; @@ -6814,14 +6847,14 @@ static int read_inline_extent(struct btrfs_path *path, struct folio *folio) if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) return uncompress_inline(path, folio, fi); - copy_size = min_t(u64, PAGE_SIZE, + copy_size = min_t(u64, blocksize, btrfs_file_extent_ram_bytes(path->nodes[0], fi)); kaddr = kmap_local_folio(folio, 0); read_extent_buffer(path->nodes[0], kaddr, btrfs_file_extent_inline_start(fi), copy_size); kunmap_local(kaddr); - if (copy_size < PAGE_SIZE) - folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size); + if (copy_size < blocksize) + folio_zero_range(folio, copy_size, blocksize - copy_size); return 0; } @@ -7062,17 +7095,17 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) * NOTE: This only checks the file extents, caller is responsible to wait for * any ordered extents. */ -noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, +noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, bool nowait) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct can_nocow_file_extent_args nocow_args = { 0 }; struct btrfs_path *path; int ret; struct extent_buffer *leaf; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_file_extent_item *fi; struct btrfs_key key; int found_type; @@ -7082,8 +7115,8 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, return -ENOMEM; path->nowait = nowait; - ret = btrfs_lookup_file_extent(NULL, root, path, - btrfs_ino(BTRFS_I(inode)), offset, 0); + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), + offset, 0); if (ret < 0) goto out; @@ -7098,7 +7131,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, ret = 0; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != btrfs_ino(BTRFS_I(inode)) || + if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) { /* not our file or wrong item type, must cow */ goto out; @@ -7119,7 +7152,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, nocow_args.end = offset + *len - 1; nocow_args.free_path = true; - ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args); + ret = can_nocow_file_extent(path, &key, inode, &nocow_args); /* can_nocow_file_extent() has freed the path. 
*/ path = NULL; @@ -7135,7 +7168,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, nocow_args.file_extent.offset)) goto out; - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + if (!(inode->flags & BTRFS_INODE_NODATACOW) && found_type == BTRFS_FILE_EXTENT_PREALLOC) { u64 range_end; @@ -7240,7 +7273,7 @@ static void wait_subpage_spinlock(struct folio *folio) struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_is_subpage(fs_info, folio)) return; ASSERT(folio_test_private(folio) && folio_get_private(folio)); @@ -7264,7 +7297,7 @@ static void wait_subpage_spinlock(struct folio *folio) static int btrfs_launder_folio(struct folio *folio) { return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio), - PAGE_SIZE, NULL); + folio_size(folio), NULL); } static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) @@ -8499,8 +8532,6 @@ static int start_delalloc_inodes(struct btrfs_root *root, struct writeback_control *wbc, bool snapshot, bool in_reclaim_context) { - struct btrfs_inode *binode; - struct inode *inode; struct btrfs_delalloc_work *work, *next; LIST_HEAD(works); LIST_HEAD(splice); @@ -8511,30 +8542,30 @@ static int start_delalloc_inodes(struct btrfs_root *root, spin_lock(&root->delalloc_lock); list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { - binode = list_entry(splice.next, struct btrfs_inode, - delalloc_inodes); + struct btrfs_inode *inode; + struct inode *tmp_inode; + + inode = list_entry(splice.next, struct btrfs_inode, delalloc_inodes); - list_move_tail(&binode->delalloc_inodes, - &root->delalloc_inodes); + list_move_tail(&inode->delalloc_inodes, &root->delalloc_inodes); if (in_reclaim_context && - test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags)) + test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags)) continue; - inode = igrab(&binode->vfs_inode); - if (!inode) { + tmp_inode = igrab(&inode->vfs_inode); + if (!tmp_inode) { cond_resched_lock(&root->delalloc_lock); continue; } spin_unlock(&root->delalloc_lock); if (snapshot) - set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, - &binode->runtime_flags); + set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags); if (full_flush) { - work = btrfs_alloc_delalloc_work(inode); + work = btrfs_alloc_delalloc_work(&inode->vfs_inode); if (!work) { - iput(inode); + iput(&inode->vfs_inode); ret = -ENOMEM; goto out; } @@ -8542,8 +8573,8 @@ static int start_delalloc_inodes(struct btrfs_root *root, btrfs_queue_work(root->fs_info->flush_workers, &work->work); } else { - ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc); - btrfs_add_delayed_iput(BTRFS_I(inode)); + ret = filemap_fdatawrite_wbc(inode->vfs_inode.i_mapping, wbc); + btrfs_add_delayed_iput(inode); if (ret || wbc->nr_to_write <= 0) goto out; } @@ -8660,7 +8691,12 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct extent_buffer *leaf; name_len = strlen(symname); - if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) + /* + * Symlinks utilize uncompressed inline extent data, which should not + * reach block size. 
+ */ + if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) || + name_len >= fs_info->sectorsize) return -ENAMETOOLONG; inode = new_inode(dir->i_sb); @@ -8699,8 +8735,8 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, goto out; } key.objectid = btrfs_ino(BTRFS_I(inode)); - key.offset = 0; key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; datasize = btrfs_file_extent_calc_inline_size(name_len); err = btrfs_insert_empty_item(trans, root, path, &key, datasize); @@ -9146,7 +9182,7 @@ out: } struct btrfs_encoded_read_private { - struct completion done; + struct completion *sync_reads; void *uring_ctx; refcount_t pending_refs; blk_status_t status; @@ -9158,11 +9194,10 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) if (bbio->bio.bi_status) { /* - * The memory barrier implied by the atomic_dec_return() here - * pairs with the memory barrier implied by the - * atomic_dec_return() or io_wait_event() in - * btrfs_encoded_read_regular_fill_pages() to ensure that this - * write is observed before the load of status in + * The memory barrier implied by the refcount_dec_and_test() here + * pairs with the memory barrier implied by the refcount_dec_and_test() + * in btrfs_encoded_read_regular_fill_pages() to ensure that + * this write is observed before the load of status in * btrfs_encoded_read_regular_fill_pages(). */ WRITE_ONCE(priv->status, bbio->bio.bi_status); @@ -9174,7 +9209,7 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) btrfs_uring_read_extent_endio(priv->uring_ctx, err); kfree(priv); } else { - complete(&priv->done); + complete(priv->sync_reads); } } bio_put(&bbio->bio); @@ -9185,16 +9220,26 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, struct page **pages, void *uring_ctx) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_encoded_read_private *priv; + struct btrfs_encoded_read_private *priv, sync_priv; + struct completion sync_reads; unsigned long i = 0; struct btrfs_bio *bbio; int ret; - priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS); - if (!priv) - return -ENOMEM; + /* + * Fast path for synchronous reads, which complete within this call; io_uring + * needs a longer time span. + */ + if (uring_ctx) { + priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS); + if (!priv) + return -ENOMEM; + } else { + priv = &sync_priv; + init_completion(&sync_reads); + priv->sync_reads = &sync_reads; + } - init_completion(&priv->done); refcount_set(&priv->pending_refs, 1); priv->status = 0; priv->uring_ctx = uring_ctx; @@ -9237,11 +9282,9 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, return -EIOCBQUEUED; } else { if (!refcount_dec_and_test(&priv->pending_refs)) - wait_for_completion_io(&priv->done); + wait_for_completion_io(&sync_reads); /* See btrfs_encoded_read_endio() for ordering. */ - ret = blk_status_to_errno(READ_ONCE(priv->status)); - kfree(priv); - return ret; + return blk_status_to_errno(READ_ONCE(priv->status)); } } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6c18bad53cd3..a13d81bb56a0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -118,8 +118,8 @@ struct btrfs_ioctl_encoded_io_args_32 { #endif /* Mask out flags that are inappropriate for the given type of inode.
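In the btrfs_encoded_read_regular_fill_pages() hunk above, the private read context is heap-allocated only for io_uring callers, whose completion outlives the submitting call; synchronous readers keep everything on the stack and wait. A schematic of that allocation split, with hypothetical submit helpers in place of the real bio machinery:

#include <errno.h>
#include <stdbool.h>
#include <stdlib.h>

struct read_private {
	int status;
	/* completion and refcount elided */
};

/* Hypothetical stand-ins for bio submission. */
static void submit_async(struct read_private *priv)	{ (void)priv; }
static void submit_and_wait(struct read_private *priv)	{ (void)priv; }

static int fill_pages(bool uring)
{
	struct read_private sync_priv = { 0 };
	struct read_private *priv;

	if (uring) {
		/* Async: must outlive this call; freed by the endio path. */
		priv = calloc(1, sizeof(*priv));
		if (!priv)
			return -ENOMEM;
		submit_async(priv);
		return 0;	/* completion reported via io_uring */
	}
	/* Sync: completed before we return, so the stack copy suffices. */
	priv = &sync_priv;
	submit_and_wait(priv);
	return priv->status;
}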
*/ -static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode, - unsigned int flags) +static unsigned int btrfs_mask_fsflags_for_type(const struct inode *inode, + unsigned int flags) { if (S_ISDIR(inode->i_mode)) return flags; @@ -133,11 +133,11 @@ static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode, * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS * ioctl. */ -static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode) +static unsigned int btrfs_inode_flags_to_fsflags(const struct btrfs_inode *inode) { unsigned int iflags = 0; - u32 flags = binode->flags; - u32 ro_flags = binode->ro_flags; + u32 flags = inode->flags; + u32 ro_flags = inode->ro_flags; if (flags & BTRFS_INODE_SYNC) iflags |= FS_SYNC_FL; @@ -167,25 +167,24 @@ static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode) /* * Update inode->i_flags based on the btrfs internal flags. */ -void btrfs_sync_inode_flags_to_i_flags(struct inode *inode) +void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode) { - struct btrfs_inode *binode = BTRFS_I(inode); unsigned int new_fl = 0; - if (binode->flags & BTRFS_INODE_SYNC) + if (inode->flags & BTRFS_INODE_SYNC) new_fl |= S_SYNC; - if (binode->flags & BTRFS_INODE_IMMUTABLE) + if (inode->flags & BTRFS_INODE_IMMUTABLE) new_fl |= S_IMMUTABLE; - if (binode->flags & BTRFS_INODE_APPEND) + if (inode->flags & BTRFS_INODE_APPEND) new_fl |= S_APPEND; - if (binode->flags & BTRFS_INODE_NOATIME) + if (inode->flags & BTRFS_INODE_NOATIME) new_fl |= S_NOATIME; - if (binode->flags & BTRFS_INODE_DIRSYNC) + if (inode->flags & BTRFS_INODE_DIRSYNC) new_fl |= S_DIRSYNC; - if (binode->ro_flags & BTRFS_INODE_RO_VERITY) + if (inode->ro_flags & BTRFS_INODE_RO_VERITY) new_fl |= S_VERITY; - set_mask_bits(&inode->i_flags, + set_mask_bits(&inode->vfs_inode.i_flags, S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC | S_VERITY, new_fl); } @@ -219,7 +218,7 @@ static int check_fsflags(unsigned int old_flags, unsigned int flags) return 0; } -static int check_fsflags_compatible(struct btrfs_fs_info *fs_info, +static int check_fsflags_compatible(const struct btrfs_fs_info *fs_info, unsigned int flags) { if (btrfs_is_zoned(fs_info) && (flags & FS_NOCOW_FL)) @@ -248,24 +247,23 @@ static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_ */ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) { - struct btrfs_inode *binode = BTRFS_I(d_inode(dentry)); + const struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); - fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode)); + fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(inode)); return 0; } int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { - struct inode *inode = d_inode(dentry); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_inode *binode = BTRFS_I(inode); - struct btrfs_root *root = binode->root; + struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; unsigned int fsflags, old_fsflags; int ret; const char *comp = NULL; - u32 binode_flags; + u32 inode_flags; if (btrfs_root_readonly(root)) return -EROFS; @@ -273,8 +271,8 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; - fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags); - old_fsflags = btrfs_inode_flags_to_fsflags(binode); + fsflags = 
btrfs_mask_fsflags_for_type(&inode->vfs_inode, fa->flags); + old_fsflags = btrfs_inode_flags_to_fsflags(inode); ret = check_fsflags(old_fsflags, fsflags); if (ret) return ret; @@ -283,27 +281,27 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, if (ret) return ret; - binode_flags = binode->flags; + inode_flags = inode->flags; if (fsflags & FS_SYNC_FL) - binode_flags |= BTRFS_INODE_SYNC; + inode_flags |= BTRFS_INODE_SYNC; else - binode_flags &= ~BTRFS_INODE_SYNC; + inode_flags &= ~BTRFS_INODE_SYNC; if (fsflags & FS_IMMUTABLE_FL) - binode_flags |= BTRFS_INODE_IMMUTABLE; + inode_flags |= BTRFS_INODE_IMMUTABLE; else - binode_flags &= ~BTRFS_INODE_IMMUTABLE; + inode_flags &= ~BTRFS_INODE_IMMUTABLE; if (fsflags & FS_APPEND_FL) - binode_flags |= BTRFS_INODE_APPEND; + inode_flags |= BTRFS_INODE_APPEND; else - binode_flags &= ~BTRFS_INODE_APPEND; + inode_flags &= ~BTRFS_INODE_APPEND; if (fsflags & FS_NODUMP_FL) - binode_flags |= BTRFS_INODE_NODUMP; + inode_flags |= BTRFS_INODE_NODUMP; else - binode_flags &= ~BTRFS_INODE_NODUMP; + inode_flags &= ~BTRFS_INODE_NODUMP; if (fsflags & FS_NOATIME_FL) - binode_flags |= BTRFS_INODE_NOATIME; + inode_flags |= BTRFS_INODE_NOATIME; else - binode_flags &= ~BTRFS_INODE_NOATIME; + inode_flags &= ~BTRFS_INODE_NOATIME; /* If coming from FS_IOC_FSSETXATTR then skip unconverted flags */ if (!fa->flags_valid) { @@ -315,32 +313,32 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, } if (fsflags & FS_DIRSYNC_FL) - binode_flags |= BTRFS_INODE_DIRSYNC; + inode_flags |= BTRFS_INODE_DIRSYNC; else - binode_flags &= ~BTRFS_INODE_DIRSYNC; + inode_flags &= ~BTRFS_INODE_DIRSYNC; if (fsflags & FS_NOCOW_FL) { - if (S_ISREG(inode->i_mode)) { + if (S_ISREG(inode->vfs_inode.i_mode)) { /* * It's safe to turn csums off here, no extents exist. * Otherwise we want the flag to reflect the real COW * status of the file and will not set it. */ - if (inode->i_size == 0) - binode_flags |= BTRFS_INODE_NODATACOW | - BTRFS_INODE_NODATASUM; + if (inode->vfs_inode.i_size == 0) + inode_flags |= BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM; } else { - binode_flags |= BTRFS_INODE_NODATACOW; + inode_flags |= BTRFS_INODE_NODATACOW; } } else { /* * Revert back under same assumptions as above */ - if (S_ISREG(inode->i_mode)) { - if (inode->i_size == 0) - binode_flags &= ~(BTRFS_INODE_NODATACOW | - BTRFS_INODE_NODATASUM); + if (S_ISREG(inode->vfs_inode.i_mode)) { + if (inode->vfs_inode.i_size == 0) + inode_flags &= ~(BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM); } else { - binode_flags &= ~BTRFS_INODE_NODATACOW; + inode_flags &= ~BTRFS_INODE_NODATACOW; } } @@ -350,21 +348,21 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, * things smaller. 
*/ if (fsflags & FS_NOCOMP_FL) { - binode_flags &= ~BTRFS_INODE_COMPRESS; - binode_flags |= BTRFS_INODE_NOCOMPRESS; + inode_flags &= ~BTRFS_INODE_COMPRESS; + inode_flags |= BTRFS_INODE_NOCOMPRESS; } else if (fsflags & FS_COMPR_FL) { - if (IS_SWAPFILE(inode)) + if (IS_SWAPFILE(&inode->vfs_inode)) return -ETXTBSY; - binode_flags |= BTRFS_INODE_COMPRESS; - binode_flags &= ~BTRFS_INODE_NOCOMPRESS; + inode_flags |= BTRFS_INODE_COMPRESS; + inode_flags &= ~BTRFS_INODE_NOCOMPRESS; comp = btrfs_compress_type2str(fs_info->compress_type); if (!comp || comp[0] == 0) comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB); } else { - binode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); + inode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } /* @@ -376,15 +374,14 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, return PTR_ERR(trans); if (comp) { - ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression", + ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp, strlen(comp), 0); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } } else { - ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression", - NULL, 0, 0); + ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL, 0, 0); if (ret && ret != -ENODATA) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -392,18 +389,19 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, } update_flags: - binode->flags = binode_flags; + inode->flags = inode_flags; + btrfs_update_inode_mapping_flags(inode); btrfs_sync_inode_flags_to_i_flags(inode); - inode_inc_iversion(inode); - inode_set_ctime_current(inode); - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + inode_inc_iversion(&inode->vfs_inode); + inode_set_ctime_current(&inode->vfs_inode); + ret = btrfs_update_inode(trans, inode); out_end_trans: btrfs_end_transaction(trans); return ret; } -static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg) +static int btrfs_ioctl_getversion(const struct inode *inode, int __user *arg) { return put_user(inode->i_generation, arg); } @@ -475,7 +473,7 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, * Calculate the number of transaction items to reserve for creating a subvolume * or snapshot, not including the inode, directory entries, or parent directory. 
*/ -static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit) +static unsigned int create_subvol_num_items(const struct btrfs_qgroup_inherit *inherit) { /* * 1 to add root block @@ -617,8 +615,8 @@ static noinline int create_subvol(struct mnt_idmap *idmap, btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID); key.objectid = objectid; - key.offset = 0; key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = 0; ret = btrfs_insert_root(trans, fs_info->tree_root, &key, root_item); if (ret) { @@ -878,7 +876,7 @@ static int btrfs_may_delete(struct mnt_idmap *idmap, /* copy of may_create in fs/namei.c() */ static inline int btrfs_may_create(struct mnt_idmap *idmap, - struct inode *dir, struct dentry *child) + struct inode *dir, const struct dentry *child) { if (d_really_is_positive(child)) return -EEXIST; @@ -1033,17 +1031,14 @@ static noinline int btrfs_ioctl_resize(struct file *file, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_fs_info *fs_info = root->fs_info; u64 new_size; u64 old_size; u64 devid = 1; - struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_vol_args *vol_args; - struct btrfs_trans_handle *trans; struct btrfs_device *device = NULL; char *sizestr; - char *retptr; char *devstr = NULL; int ret = 0; int mod = 0; @@ -1111,6 +1106,8 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (!strcmp(sizestr, "max")) new_size = bdev_nr_bytes(device->bdev); else { + char *retptr; + if (sizestr[0] == '-') { mod = -1; sizestr++; @@ -1158,6 +1155,8 @@ static noinline int btrfs_ioctl_resize(struct file *file, new_size = round_down(new_size, fs_info->sectorsize); if (new_size > old_size) { + struct btrfs_trans_handle *trans; + trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -1336,15 +1335,15 @@ free_args: return ret; } -static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode, +static noinline int btrfs_ioctl_subvol_getflags(struct btrfs_inode *inode, void __user *arg) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; u64 flags = 0; - if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) return -EINVAL; down_read(&fs_info->subvol_sem); @@ -1447,8 +1446,8 @@ out: return ret; } -static noinline int key_in_sk(struct btrfs_key *key, - struct btrfs_ioctl_search_key *sk) +static noinline int key_in_sk(const struct btrfs_key *key, + const struct btrfs_ioctl_search_key *sk) { struct btrfs_key test; int ret; @@ -1473,7 +1472,7 @@ static noinline int key_in_sk(struct btrfs_key *key, static noinline int copy_to_sk(struct btrfs_path *path, struct btrfs_key *key, - struct btrfs_ioctl_search_key *sk, + const struct btrfs_ioctl_search_key *sk, u64 *buf_size, char __user *ubuf, unsigned long *sk_offset, @@ -1530,8 +1529,8 @@ static noinline int copy_to_sk(struct btrfs_path *path, } sh.objectid = key->objectid; - sh.offset = key->offset; sh.type = key->type; + sh.offset = key->offset; sh.len = item_len; sh.transid = found_transid; @@ -1604,13 +1603,12 @@ out: return ret; } -static noinline int search_ioctl(struct inode *inode, +static noinline int search_ioctl(struct btrfs_root *root, struct btrfs_ioctl_search_key *sk, u64 
*buf_size, char __user *ubuf) { - struct btrfs_fs_info *info = inode_to_fs_info(inode); - struct btrfs_root *root; + struct btrfs_fs_info *info = root->fs_info; struct btrfs_key key; struct btrfs_path *path; int ret; @@ -1627,9 +1625,10 @@ static noinline int search_ioctl(struct inode *inode, return -ENOMEM; if (sk->tree_id == 0) { - /* search the root of the inode that was passed */ - root = btrfs_grab_root(BTRFS_I(inode)->root); + /* Search the root that we got passed. */ + root = btrfs_grab_root(root); } else { + /* Look up the root from the arguments. */ root = btrfs_get_fs_root(info, sk->tree_id, true); if (IS_ERR(root)) { btrfs_free_path(path); @@ -1642,21 +1641,19 @@ static noinline int search_ioctl(struct inode *inode, key.offset = sk->min_offset; while (1) { - ret = -EFAULT; /* * Ensure that the whole user buffer is faulted in at sub-page * granularity, otherwise the loop may live-lock. */ - if (fault_in_subpage_writeable(ubuf + sk_offset, - *buf_size - sk_offset)) + if (fault_in_subpage_writeable(ubuf + sk_offset, *buf_size - sk_offset)) { + ret = -EFAULT; break; + } ret = btrfs_search_forward(root, &key, path, sk->min_transid); - if (ret != 0) { - if (ret > 0) - ret = 0; - goto err; - } + if (ret) + break; + ret = copy_to_sk(path, &key, sk, buf_size, ubuf, &sk_offset, &num_found); btrfs_release_path(path); @@ -1664,16 +1661,17 @@ static noinline int search_ioctl(struct inode *inode, break; } + /* Normalize return values from btrfs_search_forward() and copy_to_sk(). */ if (ret > 0) ret = 0; -err: + sk->nr_items = num_found; btrfs_put_root(root); btrfs_free_path(path); return ret; } -static noinline int btrfs_ioctl_tree_search(struct inode *inode, +static noinline int btrfs_ioctl_tree_search(struct btrfs_root *root, void __user *argp) { struct btrfs_ioctl_search_args __user *uargs = argp; @@ -1689,7 +1687,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode, buf_size = sizeof(uargs->buf); - ret = search_ioctl(inode, &sk, &buf_size, uargs->buf); + ret = search_ioctl(root, &sk, &buf_size, uargs->buf); /* * In the origin implementation an overflow is handled by returning a @@ -1703,7 +1701,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode, return ret; } -static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, +static noinline int btrfs_ioctl_tree_search_v2(struct btrfs_root *root, void __user *argp) { struct btrfs_ioctl_search_args_v2 __user *uarg = argp; @@ -1725,7 +1723,7 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, if (buf_size > buf_limit) buf_size = buf_limit; - ret = search_ioctl(inode, &args.key, &buf_size, + ret = search_ioctl(root, &args.key, &buf_size, (char __user *)(&uarg->buf[0])); if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) ret = -EFAULT; @@ -1833,7 +1831,6 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, struct btrfs_path *path; struct btrfs_key key, key2; struct extent_buffer *leaf; - struct inode *temp_inode; char *ptr; int slot; int len; @@ -1861,6 +1858,8 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; while (1) { + struct btrfs_inode *temp_inode; + ret = btrfs_search_backwards(root, &key, path); if (ret < 0) goto out_put; @@ -1915,9 +1914,9 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, ret = PTR_ERR(temp_inode); goto out_put; } - ret = inode_permission(idmap, temp_inode, + ret = inode_permission(idmap, &temp_inode->vfs_inode, MAY_READ | MAY_EXEC); 
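An aside on the loop shape in search_ioctl() above: the whole user buffer is prefaulted at sub-page granularity before every pass, because a fault during the copy-out under btree locks could otherwise livelock the loop, and the "positive means done" convention of btrfs_search_forward() and copy_to_sk() is now normalized at a single point. A minimal user-space model of that shape; prefault(), search_forward() and copy_out() are stand-ins for the btrfs helpers, not the real functions:

#include <errno.h>
#include <stddef.h>
#include <stdio.h>

/* Stand-ins that only model the return conventions of
 * fault_in_subpage_writeable(), btrfs_search_forward() and copy_to_sk(). */
static int prefault(char *buf, size_t len) { (void)buf; (void)len; return 0; }
static int search_forward(void) { static int calls; return ++calls > 3; } /* 3 passes, then "no more items" */
static int copy_out(char *buf, size_t len) { (void)buf; (void)len; return 0; }

static int search_loop(char *ubuf, size_t buf_size)
{
	int ret;

	while (1) {
		/* Prefault the whole destination before each pass so the
		 * copy under locks cannot livelock on a partial fault. */
		if (prefault(ubuf, buf_size)) {
			ret = -EFAULT;
			break;
		}
		ret = search_forward();
		if (ret)
			break;
		ret = copy_out(ubuf, buf_size);
		if (ret)
			break;
	}
	/* Normalize once: >0 from either helper means "done", not an error. */
	return ret > 0 ? 0 : ret;
}

int main(void)
{
	char buf[64];

	printf("ret = %d\n", search_loop(buf, sizeof(buf)));
	return 0;
}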
- iput(temp_inode); + iput(&temp_inode->vfs_inode); if (ret) { ret = -EACCES; goto out_put; @@ -2571,7 +2570,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) /* the rest are all set to zero by kzalloc */ range.len = (u64)-1; } - ret = btrfs_defrag_file(file_inode(file), &file->f_ra, + ret = btrfs_defrag_file(BTRFS_I(file_inode(file)), &file->f_ra, &range, BTRFS_OLDEST_GENERATION, 0); if (ret > 0) ret = 0; @@ -2763,7 +2762,7 @@ out_free: return ret; } -static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, +static long btrfs_ioctl_fs_info(const struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_fs_info_args *fi_args; @@ -2817,7 +2816,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, return ret; } -static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, +static long btrfs_ioctl_dev_info(const struct btrfs_fs_info *fs_info, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); @@ -4248,7 +4247,7 @@ static int btrfs_ioctl_get_features(struct btrfs_fs_info *fs_info, return 0; } -static int check_feature_bits(struct btrfs_fs_info *fs_info, +static int check_feature_bits(const struct btrfs_fs_info *fs_info, enum btrfs_feature_set set, u64 change_mask, u64 flags, u64 supported_flags, u64 safe_set, u64 safe_clear) @@ -4384,7 +4383,7 @@ out_drop_write: return ret; } -static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool compat) +static int _btrfs_ioctl_send(struct btrfs_root *root, void __user *argp, bool compat) { struct btrfs_ioctl_send_args *arg; int ret; @@ -4415,7 +4414,7 @@ static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool if (IS_ERR(arg)) return PTR_ERR(arg); } - ret = btrfs_ioctl_send(inode, arg); + ret = btrfs_ioctl_send(root, arg); kfree(arg); return ret; } @@ -5242,7 +5241,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SNAP_DESTROY_V2: return btrfs_ioctl_snap_destroy(file, argp, true); case BTRFS_IOC_SUBVOL_GETFLAGS: - return btrfs_ioctl_subvol_getflags(inode, argp); + return btrfs_ioctl_subvol_getflags(BTRFS_I(inode), argp); case BTRFS_IOC_SUBVOL_SETFLAGS: return btrfs_ioctl_subvol_setflags(file, argp); case BTRFS_IOC_DEFAULT_SUBVOL: @@ -5264,9 +5263,9 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_DEV_INFO: return btrfs_ioctl_dev_info(fs_info, argp); case BTRFS_IOC_TREE_SEARCH: - return btrfs_ioctl_tree_search(inode, argp); + return btrfs_ioctl_tree_search(root, argp); case BTRFS_IOC_TREE_SEARCH_V2: - return btrfs_ioctl_tree_search_v2(inode, argp); + return btrfs_ioctl_tree_search_v2(root, argp); case BTRFS_IOC_INO_LOOKUP: return btrfs_ioctl_ino_lookup(root, argp); case BTRFS_IOC_INO_PATHS: @@ -5314,10 +5313,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_set_received_subvol_32(file, argp); #endif case BTRFS_IOC_SEND: - return _btrfs_ioctl_send(BTRFS_I(inode), argp, false); + return _btrfs_ioctl_send(root, argp, false); #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) case BTRFS_IOC_SEND_32: - return _btrfs_ioctl_send(BTRFS_I(inode), argp, true); + return _btrfs_ioctl_send(root, argp, true); #endif case BTRFS_IOC_GET_DEV_STATS: return btrfs_ioctl_get_dev_stats(fs_info, argp); diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index ce915fcda43b..e08ea446cf48 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -9,6 +9,8 @@ struct file; struct dentry; struct mnt_idmap; struct fileattr; +struct io_uring_cmd; +struct btrfs_inode; struct btrfs_fs_info; struct btrfs_ioctl_balance_args; @@ -18,7 
+20,7 @@ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int btrfs_ioctl_get_supported_features(void __user *arg); -void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); +void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 9a7a7b723305..81e62b652e21 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -9,7 +9,6 @@ #include <linux/page-flags.h> #include <asm/bug.h> #include <trace/events/btrfs.h> -#include "misc.h" #include "ctree.h" #include "extent_io.h" #include "locking.h" diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 4aca7475fd82..03c945711003 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -842,10 +842,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, /* * Start IO and wait for a given ordered extent to finish. * - * Wait on page writeback for all the pages in the extent and the IO completion - * code to insert metadata into the btree corresponding to the extent. + * Wait on page writeback for all the pages in the extent but not in + * [@nowriteback_start, @nowriteback_start + @nowriteback_len) and the + * IO completion code to insert metadata into the btree corresponding to the extent. */ -void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) +void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry, + u64 nowriteback_start, u32 nowriteback_len) { u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; @@ -865,8 +867,19 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) * start IO on any dirty ones so the wait doesn't stall waiting * for the flusher thread to find them */ - if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) - filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); + if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) { + if (!nowriteback_len) { + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); + } else { + if (start < nowriteback_start) + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, + nowriteback_start - 1); + if (nowriteback_start + nowriteback_len < end) + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, + nowriteback_start + nowriteback_len, + end); + } + } if (!freespace_inode) btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 4e152736d06c..1e6b0b182b29 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -17,6 +17,7 @@ struct inode; struct page; struct extent_state; +struct btrfs_block_group; struct btrfs_inode; struct btrfs_root; struct btrfs_fs_info; @@ -191,7 +192,13 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset); -void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry); +void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry, + u64 nowriteback_start, u32 nowriteback_len); +static inline void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) +{ + return 
btrfs_start_ordered_extent_nowriteback(entry, 0, 0); +} + int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index 8504bf1702c7..d0e620bf5f5a 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -6,6 +6,8 @@ #ifndef BTRFS_PRINT_TREE_H #define BTRFS_PRINT_TREE_H +#include <linux/types.h> + /* Buffer size to contain tree name and possibly additional data (offset) */ #define BTRFS_ROOT_NAME_BUF_LEN 48 diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index b8fa34e16abb..adc956432d2f 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -26,8 +26,8 @@ struct prop_handler { const char *xattr_name; int (*validate)(const struct btrfs_inode *inode, const char *value, size_t len); - int (*apply)(struct inode *inode, const char *value, size_t len); - const char *(*extract)(const struct inode *inode); + int (*apply)(struct btrfs_inode *inode, const char *value, size_t len); + const char *(*extract)(const struct btrfs_inode *inode); bool (*ignore)(const struct btrfs_inode *inode); int inheritable; }; @@ -121,7 +121,7 @@ int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, if (ret) return ret; - ret = handler->apply(&inode->vfs_inode, NULL, 0); + ret = handler->apply(inode, NULL, 0); ASSERT(ret == 0); return ret; @@ -131,7 +131,7 @@ int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, value_len, flags); if (ret) return ret; - ret = handler->apply(&inode->vfs_inode, value, value_len); + ret = handler->apply(inode, value, value_len); if (ret) { btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name, NULL, 0, flags); @@ -263,7 +263,7 @@ static void inode_prop_iterator(void *ctx, struct btrfs_root *root = BTRFS_I(inode)->root; int ret; - ret = handler->apply(inode, value, len); + ret = handler->apply(BTRFS_I(inode), value, len); if (unlikely(ret)) btrfs_warn(root->fs_info, "error applying prop %s to ino %llu (root %llu): %d", @@ -273,12 +273,13 @@ static void inode_prop_iterator(void *ctx, set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); } -int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path) +int btrfs_load_inode_props(struct btrfs_inode *inode, struct btrfs_path *path) { - struct btrfs_root *root = BTRFS_I(inode)->root; - u64 ino = btrfs_ino(BTRFS_I(inode)); + struct btrfs_root *root = inode->root; + u64 ino = btrfs_ino(inode); - return iterate_object_props(root, path, ino, inode_prop_iterator, inode); + return iterate_object_props(root, path, ino, inode_prop_iterator, + &inode->vfs_inode); } static int prop_compression_validate(const struct btrfs_inode *inode, @@ -300,26 +301,26 @@ static int prop_compression_validate(const struct btrfs_inode *inode, return -EINVAL; } -static int prop_compression_apply(struct inode *inode, const char *value, +static int prop_compression_apply(struct btrfs_inode *inode, const char *value, size_t len) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; int type; /* Reset to defaults */ if (len == 0) { - BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; + inode->flags &= ~BTRFS_INODE_COMPRESS; + inode->flags &= ~BTRFS_INODE_NOCOMPRESS; + inode->prop_compress = BTRFS_COMPRESS_NONE; return 0; } /* Set 
NOCOMPRESS flag */ if ((len == 2 && strncmp("no", value, 2) == 0) || (len == 4 && strncmp("none", value, 4) == 0)) { - BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; + inode->flags |= BTRFS_INODE_NOCOMPRESS; + inode->flags &= ~BTRFS_INODE_COMPRESS; + inode->prop_compress = BTRFS_COMPRESS_NONE; return 0; } @@ -336,9 +337,9 @@ static int prop_compression_apply(struct inode *inode, const char *value, return -EINVAL; } - BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->prop_compress = type; + inode->flags &= ~BTRFS_INODE_NOCOMPRESS; + inode->flags |= BTRFS_INODE_COMPRESS; + inode->prop_compress = type; return 0; } @@ -359,13 +360,13 @@ static bool prop_compression_ignore(const struct btrfs_inode *inode) return false; } -static const char *prop_compression_extract(const struct inode *inode) +static const char *prop_compression_extract(const struct btrfs_inode *inode) { - switch (BTRFS_I(inode)->prop_compress) { + switch (inode->prop_compress) { case BTRFS_COMPRESS_ZLIB: case BTRFS_COMPRESS_LZO: case BTRFS_COMPRESS_ZSTD: - return btrfs_compress_type2str(BTRFS_I(inode)->prop_compress); + return btrfs_compress_type2str(inode->prop_compress); default: break; } @@ -385,16 +386,16 @@ static struct prop_handler prop_handlers[] = { }; int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, - struct inode *inode, const struct inode *parent) + struct btrfs_inode *inode, + const struct btrfs_inode *parent) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; int ret; int i; bool need_reserve = false; - if (!test_bit(BTRFS_INODE_HAS_PROPS, - &BTRFS_I(parent)->runtime_flags)) + if (!test_bit(BTRFS_INODE_HAS_PROPS, &parent->runtime_flags)) return 0; for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { @@ -405,7 +406,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, if (!h->inheritable) continue; - if (h->ignore(BTRFS_I(inode))) + if (h->ignore(inode)) continue; value = h->extract(parent); @@ -416,7 +417,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, * This is not strictly necessary as the property should be * valid, but in case it isn't, don't propagate it further. 
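The props.c hunks above switch the prop_handlers[] callbacks from struct inode to struct btrfs_inode and const-qualify extract(). The table-of-handlers shape itself is easy to see in isolation; the following is a simplified user-space stand-in (my_inode, the handler bodies and the single "compression" entry are illustrative, not the btrfs table):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct my_inode { int compress; };        /* stand-in for struct btrfs_inode */

struct prop_handler {
	const char *xattr_name;
	int (*validate)(const struct my_inode *inode, const char *value, size_t len);
	int (*apply)(struct my_inode *inode, const char *value, size_t len);
	const char *(*extract)(const struct my_inode *inode);
	bool inheritable;
};

static int compress_validate(const struct my_inode *inode, const char *v, size_t len)
{
	(void)inode;
	return (len == 0 || strncmp(v, "zstd", len) == 0) ? 0 : -1;
}

static int compress_apply(struct my_inode *inode, const char *v, size_t len)
{
	(void)v;
	inode->compress = (len != 0);     /* empty value resets to defaults */
	return 0;
}

static const char *compress_extract(const struct my_inode *inode)
{
	return inode->compress ? "zstd" : NULL;
}

static const struct prop_handler handlers[] = {
	{ "btrfs.compression", compress_validate, compress_apply,
	  compress_extract, true },
};

int main(void)
{
	struct my_inode parent = { .compress = 1 }, child = { 0 };

	/* Inheritance walks the table: extract from parent, validate, apply. */
	for (size_t i = 0; i < sizeof(handlers) / sizeof(handlers[0]); i++) {
		const char *v = handlers[i].extract(&parent);

		if (handlers[i].inheritable && v &&
		    handlers[i].validate(&child, v, strlen(v)) == 0)
			handlers[i].apply(&child, v, strlen(v));
	}
	printf("child compress=%d\n", child.compress);
	return 0;
}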
*/ - ret = h->validate(BTRFS_I(inode), value, strlen(value)); + ret = h->validate(inode, value, strlen(value)); if (ret) continue; @@ -436,16 +437,15 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, return ret; } - ret = btrfs_setxattr(trans, inode, h->xattr_name, value, + ret = btrfs_setxattr(trans, &inode->vfs_inode, h->xattr_name, value, strlen(value), 0); if (!ret) { ret = h->apply(inode, value, strlen(value)); if (ret) - btrfs_setxattr(trans, inode, h->xattr_name, + btrfs_setxattr(trans, &inode->vfs_inode, h->xattr_name, NULL, 0, 0); else - set_bit(BTRFS_INODE_HAS_PROPS, - &BTRFS_I(inode)->runtime_flags); + set_bit(BTRFS_INODE_HAS_PROPS, &inode->runtime_flags); } if (need_reserve) { diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index 63546d0a9444..15d9a025c923 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -6,9 +6,9 @@ #ifndef BTRFS_PROPS_H #define BTRFS_PROPS_H +#include <linux/types.h> #include <linux/compiler_types.h> -struct inode; struct btrfs_inode; struct btrfs_path; struct btrfs_trans_handle; @@ -22,10 +22,10 @@ int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name, const char *value, size_t value_len); bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name); -int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path); +int btrfs_load_inode_props(struct btrfs_inode *inode, struct btrfs_path *path); int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, - struct inode *inode, - const struct inode *dir); + struct btrfs_inode *inode, + const struct btrfs_inode *dir); #endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index f9d3766c809b..d6fa36674270 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -956,8 +956,8 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, return -ENOMEM; key.objectid = 0; - key.offset = 0; key.type = 0; + key.offset = 0; while (1) { ret = btrfs_search_slot(trans, root, &key, path, -1, 1); diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index e233cc79af18..a979fd59a4da 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -22,6 +22,9 @@ struct btrfs_ioctl_quota_ctl_args; struct btrfs_trans_handle; struct btrfs_delayed_ref_root; struct btrfs_inode; +struct btrfs_transaction; +struct btrfs_block_group; +struct btrfs_qgroup_swapped_blocks; /* * Btrfs qgroup overview diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h index 541836421778..69942ad43140 100644 --- a/fs/btrfs/raid-stripe-tree.h +++ b/fs/btrfs/raid-stripe-tree.h @@ -9,6 +9,7 @@ #include <linux/types.h> #include <uapi/linux/btrfs_tree.h> #include "fs.h" +#include "accessors.h" #define BTRFS_RST_SUPP_BLOCK_GROUP_MASK (BTRFS_BLOCK_GROUP_DUP | \ BTRFS_BLOCK_GROUP_RAID1_MASK | \ diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index f0824c948cb7..15c296cb4dac 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -165,7 +165,7 @@ out: * the source inode to destination inode when possible. When not possible we * copy the inline extent's data into the respective page of the inode. 
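Several hunks across this diff (create_subvol, the copy_to_sk header, btrfs_clean_quota_tree above, and the scrub lookups further down) change nothing but the order in which key.type and key.offset are assigned. They converge on initializing the triple in the same order it is declared and compared: objectid, then type, then offset. A small sketch of the convention; the struct layout mirrors the btrfs UAPI key, and the helper around it is illustrative:

#include <stdint.h>
#include <stdio.h>

/* Mirrors the on-disk btrfs key: compared as (objectid, type, offset). */
struct btrfs_key {
	uint64_t objectid;
	uint8_t  type;
	uint64_t offset;
};

#define BTRFS_DEV_EXTENT_KEY 204   /* value from the btrfs UAPI headers */

static void init_dev_extent_key(struct btrfs_key *key, uint64_t devid)
{
	/* Assign in declaration/comparison order: objectid, type, offset. */
	key->objectid = devid;
	key->type = BTRFS_DEV_EXTENT_KEY;
	key->offset = 0;
}

int main(void)
{
	struct btrfs_key key;

	init_dev_extent_key(&key, 1);
	printf("objectid=%llu type=%u offset=%llu\n",
	       (unsigned long long)key.objectid, key.type,
	       (unsigned long long)key.offset);
	return 0;
}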
*/ -static int clone_copy_inline_extent(struct inode *dst, +static int clone_copy_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_key *new_key, const u64 drop_start, @@ -175,8 +175,8 @@ static int clone_copy_inline_extent(struct inode *dst, char *inline_data, struct btrfs_trans_handle **trans_out) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(dst); - struct btrfs_root *root = BTRFS_I(dst)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; const u64 aligned_end = ALIGN(new_key->offset + datal, fs_info->sectorsize); struct btrfs_trans_handle *trans = NULL; @@ -185,12 +185,12 @@ static int clone_copy_inline_extent(struct inode *dst, struct btrfs_key key; if (new_key->offset > 0) { - ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + ret = copy_inline_to_page(inode, new_key->offset, inline_data, size, datal, comp_type); goto out; } - key.objectid = btrfs_ino(BTRFS_I(dst)); + key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -205,7 +205,7 @@ static int clone_copy_inline_extent(struct inode *dst, goto copy_inline_extent; } btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == btrfs_ino(BTRFS_I(dst)) && + if (key.objectid == btrfs_ino(inode) && key.type == BTRFS_EXTENT_DATA_KEY) { /* * There's an implicit hole at file offset 0, copy the @@ -214,7 +214,7 @@ static int clone_copy_inline_extent(struct inode *dst, ASSERT(key.offset > 0); goto copy_to_page; } - } else if (i_size_read(dst) <= datal) { + } else if (i_size_read(&inode->vfs_inode) <= datal) { struct btrfs_file_extent_item *ei; ei = btrfs_item_ptr(path->nodes[0], path->slots[0], @@ -236,7 +236,7 @@ copy_inline_extent: * We have no extent items, or we have an extent at offset 0 which may * or may not be inlined. All these cases are dealt the same way. 
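A large share of this diff is this same mechanical conversion: functions stop taking struct inode and start taking struct btrfs_inode, with &inode->vfs_inode used at the VFS boundary and BTRFS_I() applied once at the entry points. The underlying pattern is the usual embedded-struct/container_of conversion; here is a self-contained model of it (my_inode and MY_I are stand-ins for btrfs_inode and BTRFS_I):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct inode { long i_size; };              /* stand-in for the VFS inode */

struct my_inode {                           /* stand-in for struct btrfs_inode */
	unsigned int flags;
	struct inode vfs_inode;             /* embedded VFS part */
};

/* Models BTRFS_I(): recover the containing fs inode from the VFS one. */
static struct my_inode *MY_I(struct inode *inode)
{
	return container_of(inode, struct my_inode, vfs_inode);
}

int main(void)
{
	struct my_inode mi = { .flags = 1, .vfs_inode = { .i_size = 4096 } };
	struct inode *vfs = &mi.vfs_inode;      /* what the VFS hands us */

	/* Internal helpers take struct my_inode *, converting once at entry
	 * and going back through &mi->vfs_inode only at the VFS boundary. */
	printf("flags=%u size=%ld\n", MY_I(vfs)->flags,
	       MY_I(vfs)->vfs_inode.i_size);
	return 0;
}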
*/ - if (i_size_read(dst) > datal) { + if (i_size_read(&inode->vfs_inode) > datal) { /* * At the destination offset 0 we have either a hole, a regular * extent or an inline extent larger then the one we want to @@ -270,7 +270,7 @@ copy_inline_extent: drop_args.start = drop_start; drop_args.end = aligned_end; drop_args.drop_cache = true; - ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args); + ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) goto out; ret = btrfs_insert_empty_item(trans, root, path, new_key, size); @@ -281,9 +281,9 @@ copy_inline_extent: btrfs_item_ptr_offset(path->nodes[0], path->slots[0]), size); - btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found); - btrfs_set_inode_full_sync(BTRFS_I(dst)); - ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end); + btrfs_update_inode_bytes(inode, datal, drop_args.bytes_found); + btrfs_set_inode_full_sync(inode); + ret = btrfs_inode_set_file_extent_range(inode, 0, aligned_end); out: if (!ret && !trans) { /* @@ -318,7 +318,7 @@ copy_to_page: */ btrfs_release_path(path); - ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + ret = copy_inline_to_page(inode, new_key->offset, inline_data, size, datal, comp_type); goto out; } @@ -526,7 +526,7 @@ process_slot: goto out; } - ret = clone_copy_inline_extent(inode, path, &new_key, + ret = clone_copy_inline_extent(BTRFS_I(inode), path, &new_key, drop_start, datal, size, comp, buf, &trans); if (ret) @@ -617,26 +617,26 @@ out: return ret; } -static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2) +static void btrfs_double_mmap_lock(struct btrfs_inode *inode1, struct btrfs_inode *inode2) { if (inode1 < inode2) swap(inode1, inode2); - down_write(&BTRFS_I(inode1)->i_mmap_lock); - down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING); + down_write(&inode1->i_mmap_lock); + down_write_nested(&inode2->i_mmap_lock, SINGLE_DEPTH_NESTING); } -static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2) +static void btrfs_double_mmap_unlock(struct btrfs_inode *inode1, struct btrfs_inode *inode2) { - up_write(&BTRFS_I(inode1)->i_mmap_lock); - up_write(&BTRFS_I(inode2)->i_mmap_lock); + up_write(&inode1->i_mmap_lock); + up_write(&inode2->i_mmap_lock); } -static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, - struct inode *dst, u64 dst_loff) +static int btrfs_extent_same_range(struct btrfs_inode *src, u64 loff, u64 len, + struct btrfs_inode *dst, u64 dst_loff) { const u64 end = dst_loff + len - 1; struct extent_state *cached_state = NULL; - struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info; + struct btrfs_fs_info *fs_info = src->root->fs_info; const u64 bs = fs_info->sectorsize; int ret; @@ -646,9 +646,10 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, * because we have already locked the inode's i_mmap_lock in exclusive * mode. 
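btrfs_double_mmap_lock() just above encodes the standard deadlock-avoidance rule for taking the same lock class on two objects: impose a total order by pointer address, then annotate the second acquisition as nested (down_write_nested() with SINGLE_DEPTH_NESTING, for lockdep). A user-space model with pthreads; the ordering direction and the missing lockdep annotation are modeled loosely:

#include <pthread.h>
#include <stdio.h>

struct my_inode {
	pthread_mutex_t mmap_lock;
};

static void double_mmap_lock(struct my_inode *a, struct my_inode *b)
{
	/* Total order by address: concurrent lockers of the same pair
	 * always acquire in the same order, so they cannot deadlock. */
	if (a > b) {
		struct my_inode *tmp = a;

		a = b;
		b = tmp;
	}
	pthread_mutex_lock(&a->mmap_lock);
	pthread_mutex_lock(&b->mmap_lock);   /* the "nested" acquisition */
}

static void double_mmap_unlock(struct my_inode *a, struct my_inode *b)
{
	/* Unlock order does not matter for correctness. */
	pthread_mutex_unlock(&a->mmap_lock);
	pthread_mutex_unlock(&b->mmap_lock);
}

int main(void)
{
	struct my_inode a = { PTHREAD_MUTEX_INITIALIZER };
	struct my_inode b = { PTHREAD_MUTEX_INITIALIZER };

	double_mmap_lock(&a, &b);
	puts("both locked, in address order");
	double_mmap_unlock(&a, &b);
	return 0;
}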
*/ - lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state); - ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); - unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state); + lock_extent(&dst->io_tree, dst_loff, end, &cached_state); + ret = btrfs_clone(&src->vfs_inode, &dst->vfs_inode, loff, len, + ALIGN(len, bs), dst_loff, 1); + unlock_extent(&dst->io_tree, dst_loff, end, &cached_state); btrfs_btree_balance_dirty(fs_info); @@ -678,8 +679,8 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); for (i = 0; i < chunk_count; i++) { - ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, - dst, dst_loff); + ret = btrfs_extent_same_range(BTRFS_I(src), loff, BTRFS_MAX_DEDUPE_LEN, + BTRFS_I(dst), dst_loff); if (ret) goto out; @@ -688,7 +689,8 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, } if (tail_len > 0) - ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff); + ret = btrfs_extent_same_range(BTRFS_I(src), loff, tail_len, + BTRFS_I(dst), dst_loff); out: spin_lock(&root_dst->root_item_lock); root_dst->dedupe_in_progress--; @@ -775,24 +777,24 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags) { - struct inode *inode_in = file_inode(file_in); - struct inode *inode_out = file_inode(file_out); - u64 bs = BTRFS_I(inode_out)->root->fs_info->sectorsize; + struct btrfs_inode *inode_in = BTRFS_I(file_inode(file_in)); + struct btrfs_inode *inode_out = BTRFS_I(file_inode(file_out)); + u64 bs = inode_out->root->fs_info->sectorsize; u64 wb_len; int ret; if (!(remap_flags & REMAP_FILE_DEDUP)) { - struct btrfs_root *root_out = BTRFS_I(inode_out)->root; + struct btrfs_root *root_out = inode_out->root; if (btrfs_root_readonly(root_out)) return -EROFS; - ASSERT(inode_in->i_sb == inode_out->i_sb); + ASSERT(inode_in->vfs_inode.i_sb == inode_out->vfs_inode.i_sb); } /* Don't make the dst file partly checksummed */ - if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) { + if ((inode_in->flags & BTRFS_INODE_NODATASUM) != + (inode_out->flags & BTRFS_INODE_NODATASUM)) { return -EINVAL; } @@ -811,7 +813,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, * to complete so that new file extent items are in the fs tree. */ if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP)) - wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs); + wb_len = ALIGN(inode_in->vfs_inode.i_size, bs) - ALIGN_DOWN(pos_in, bs); else wb_len = ALIGN(*len, bs); @@ -832,16 +834,14 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, * Also we don't need to check ASYNC_EXTENT, as async extent will be * CoWed anyway, not affecting nocow part. 
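btrfs_extent_same() above splits a dedupe request into BTRFS_MAX_DEDUPE_LEN chunks plus a tail, locking and processing one range at a time so a huge request never pins one enormous extent range. The chunking arithmetic on its own, as a sketch; do_range() is a hypothetical stand-in for btrfs_extent_same_range(), and the 16 MiB value mirrors the kernel constant:

#include <stdint.h>
#include <stdio.h>

#define MAX_DEDUPE_LEN (16ULL * 1024 * 1024)  /* mirrors BTRFS_MAX_DEDUPE_LEN */

static int do_range(uint64_t loff, uint64_t dst_loff, uint64_t len)
{
	printf("dedupe src=%llu dst=%llu len=%llu\n",
	       (unsigned long long)loff, (unsigned long long)dst_loff,
	       (unsigned long long)len);
	return 0;
}

static int extent_same(uint64_t loff, uint64_t olen, uint64_t dst_loff)
{
	uint64_t chunks = olen / MAX_DEDUPE_LEN;   /* div_u64() in the kernel */
	uint64_t tail = olen % MAX_DEDUPE_LEN;
	int ret = 0;

	for (uint64_t i = 0; i < chunks; i++) {
		ret = do_range(loff, dst_loff, MAX_DEDUPE_LEN);
		if (ret)
			return ret;
		loff += MAX_DEDUPE_LEN;
		dst_loff += MAX_DEDUPE_LEN;
	}
	if (tail > 0)
		ret = do_range(loff, dst_loff, tail);
	return ret;
}

int main(void)
{
	return extent_same(0, MAX_DEDUPE_LEN * 2 + 4096, 0);
}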
*/ - ret = filemap_flush(inode_in->i_mapping); + ret = filemap_flush(inode_in->vfs_inode.i_mapping); if (ret < 0) return ret; - ret = btrfs_wait_ordered_range(BTRFS_I(inode_in), ALIGN_DOWN(pos_in, bs), - wb_len); + ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), wb_len); if (ret < 0) return ret; - ret = btrfs_wait_ordered_range(BTRFS_I(inode_out), ALIGN_DOWN(pos_out, bs), - wb_len); + ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), wb_len); if (ret < 0) return ret; @@ -863,8 +863,8 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, struct file *dst_file, loff_t destoff, loff_t len, unsigned int remap_flags) { - struct inode *src_inode = file_inode(src_file); - struct inode *dst_inode = file_inode(dst_file); + struct btrfs_inode *src_inode = BTRFS_I(file_inode(src_file)); + struct btrfs_inode *dst_inode = BTRFS_I(file_inode(dst_file)); bool same_inode = dst_inode == src_inode; int ret; @@ -872,9 +872,9 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, return -EINVAL; if (same_inode) { - btrfs_inode_lock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); + btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP); } else { - lock_two_nondirectories(src_inode, dst_inode); + lock_two_nondirectories(&src_inode->vfs_inode, &dst_inode->vfs_inode); btrfs_double_mmap_lock(src_inode, dst_inode); } @@ -884,16 +884,18 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, goto out_unlock; if (remap_flags & REMAP_FILE_DEDUP) - ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff); + ret = btrfs_extent_same(&src_inode->vfs_inode, off, len, + &dst_inode->vfs_inode, destoff); else ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); out_unlock: if (same_inode) { - btrfs_inode_unlock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP); } else { btrfs_double_mmap_unlock(src_inode, dst_inode); - unlock_two_nondirectories(src_inode, dst_inode); + unlock_two_nondirectories(&src_inode->vfs_inode, + &dst_inode->vfs_inode); } /* diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index af0969b70b53..f948f4f6431c 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3239,21 +3239,23 @@ out: return ret; } -static int delete_block_group_cache(struct btrfs_fs_info *fs_info, - struct btrfs_block_group *block_group, +static int delete_block_group_cache(struct btrfs_block_group *block_group, struct inode *inode, u64 ino) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *root = fs_info->tree_root; struct btrfs_trans_handle *trans; + struct btrfs_inode *btrfs_inode; int ret = 0; if (inode) goto truncate; - inode = btrfs_iget(ino, root); - if (IS_ERR(inode)) + btrfs_inode = btrfs_iget(ino, root); + if (IS_ERR(btrfs_inode)) return -ENOENT; + inode = &btrfs_inode->vfs_inode; truncate: ret = btrfs_check_trunc_cache_free_space(fs_info, @@ -3313,8 +3315,7 @@ static int delete_v1_space_cache(struct extent_buffer *leaf, } if (!found) return -ENOENT; - ret = delete_block_group_cache(leaf->fs_info, block_group, NULL, - space_cache_ino); + ret = delete_block_group_cache(block_group, NULL, space_cache_ino); return ret; } @@ -3761,10 +3762,10 @@ out: * the inode is in data relocation tree and its link count is 0 */ static noinline_for_stack struct inode *create_reloc_inode( - struct btrfs_fs_info *fs_info, const struct btrfs_block_group *group) { - struct inode *inode = NULL; + struct btrfs_fs_info *fs_info = group->fs_info; + struct btrfs_inode *inode = NULL; struct 
btrfs_trans_handle *trans; struct btrfs_root *root; u64 objectid; @@ -3792,18 +3793,19 @@ static noinline_for_stack struct inode *create_reloc_inode( inode = NULL; goto out; } - BTRFS_I(inode)->reloc_block_group_start = group->start; + inode->reloc_block_group_start = group->start; - ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + ret = btrfs_orphan_add(trans, inode); out: btrfs_put_root(root); btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); if (ret) { - iput(inode); + if (inode) + iput(&inode->vfs_inode); inode = ERR_PTR(ret); } - return inode; + return &inode->vfs_inode; } /* @@ -3977,7 +3979,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) btrfs_free_path(path); if (!IS_ERR(inode)) - ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0); + ret = delete_block_group_cache(rc->block_group, inode, 0); else ret = PTR_ERR(inode); @@ -3986,7 +3988,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) goto out; } - rc->data_inode = create_reloc_inode(fs_info, rc->block_group); + rc->data_inode = create_reloc_inode(rc->block_group); if (IS_ERR(rc->data_inode)) { err = PTR_ERR(rc->data_inode); rc->data_inode = NULL; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 531312efee8d..2c5edcee9450 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1380,11 +1380,11 @@ static int find_first_extent_item(struct btrfs_root *extent_root, if (path->nodes[0]) goto search_forward; + key.objectid = search_start; if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; - key.objectid = search_start; key.offset = (u64)-1; ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); @@ -2497,8 +2497,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, path->skip_locking = 1; key.objectid = scrub_dev->devid; - key.offset = 0ull; key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = 0ull; while (1) { u64 dev_extent_len; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f437138fefbc..0c8c58c4f29b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -16,7 +16,6 @@ #include <linux/compat.h> #include <linux/crc32c.h> #include <linux/fsverity.h> - #include "send.h" #include "ctree.h" #include "backref.h" @@ -178,6 +177,7 @@ struct send_ctx { u64 cur_inode_rdev; u64 cur_inode_last_extent; u64 cur_inode_next_write_offset; + struct fs_path cur_inode_path; bool cur_inode_new; bool cur_inode_new_gen; bool cur_inode_deleted; @@ -425,15 +425,21 @@ static int need_send_hole(struct send_ctx *sctx) static void fs_path_reset(struct fs_path *p) { - if (p->reversed) { + if (p->reversed) p->start = p->buf + p->buf_len - 1; - p->end = p->start; - *p->start = 0; - } else { + else p->start = p->buf; - p->end = p->start; - *p->start = 0; - } + + p->end = p->start; + *p->start = 0; +} + +static void init_path(struct fs_path *p) +{ + p->reversed = 0; + p->buf = p->inline_buf; + p->buf_len = FS_PATH_INLINE_SIZE; + fs_path_reset(p); } static struct fs_path *fs_path_alloc(void) @@ -443,10 +449,7 @@ static struct fs_path *fs_path_alloc(void) p = kmalloc(sizeof(*p), GFP_KERNEL); if (!p) return NULL; - p->reversed = 0; - p->buf = p->inline_buf; - p->buf_len = FS_PATH_INLINE_SIZE; - fs_path_reset(p); + init_path(p); return p; } @@ -471,7 +474,7 @@ static void fs_path_free(struct fs_path *p) kfree(p); } -static int fs_path_len(struct fs_path *p) +static inline int fs_path_len(const struct fs_path *p) { return p->end - p->start; } @@ -487,12 +490,10 @@ static int 
fs_path_ensure_buf(struct fs_path *p, int len) if (p->buf_len >= len) return 0; - if (len > PATH_MAX) { - WARN_ON(1); - return -ENOMEM; - } + if (WARN_ON(len > PATH_MAX)) + return -ENAMETOOLONG; - path_len = p->end - p->start; + path_len = fs_path_len(p); old_buf_len = p->buf_len; /* @@ -533,12 +534,12 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len, int ret; int new_len; - new_len = p->end - p->start + name_len; + new_len = fs_path_len(p) + name_len; if (p->start != p->end) new_len++; ret = fs_path_ensure_buf(p, new_len); if (ret < 0) - goto out; + return ret; if (p->reversed) { if (p->start != p->end) @@ -553,8 +554,7 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len, *p->end = 0; } -out: - return ret; + return 0; } static int fs_path_add(struct fs_path *p, const char *name, int name_len) @@ -564,25 +564,15 @@ static int fs_path_add(struct fs_path *p, const char *name, int name_len) ret = fs_path_prepare_for_add(p, name_len, &prepared); if (ret < 0) - goto out; + return ret; memcpy(prepared, name, name_len); -out: - return ret; + return 0; } -static int fs_path_add_path(struct fs_path *p, struct fs_path *p2) +static inline int fs_path_add_path(struct fs_path *p, const struct fs_path *p2) { - int ret; - char *prepared; - - ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared); - if (ret < 0) - goto out; - memcpy(prepared, p2->start, p2->end - p2->start); - -out: - return ret; + return fs_path_add(p, p2->start, fs_path_len(p2)); } static int fs_path_add_from_extent_buffer(struct fs_path *p, @@ -594,12 +584,11 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p, ret = fs_path_prepare_for_add(p, len, &prepared); if (ret < 0) - goto out; + return ret; read_extent_buffer(eb, prepared, off, len); -out: - return ret; + return 0; } static int fs_path_copy(struct fs_path *p, struct fs_path *from) @@ -619,13 +608,21 @@ static void fs_path_unreverse(struct fs_path *p) return; tmp = p->start; - len = p->end - p->start; + len = fs_path_len(p); p->start = p->buf; p->end = p->start + len; memmove(p->start, tmp, len + 1); p->reversed = 0; } +static inline bool is_current_inode_path(const struct send_ctx *sctx, + const struct fs_path *path) +{ + const struct fs_path *cur = &sctx->cur_inode_path; + + return (strncmp(path->start, cur->start, fs_path_len(cur)) == 0); +} + static struct btrfs_path *alloc_path_for_send(void) { struct btrfs_path *path; @@ -740,7 +737,7 @@ static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr, #define TLV_PUT_PATH(sctx, attrtype, p) \ do { \ ret = tlv_put_string(sctx, attrtype, p->start, \ - p->end - p->start); \ + fs_path_len((p))); \ if (ret < 0) \ goto tlv_put_failure; \ } while(0) @@ -826,7 +823,7 @@ static int send_rename(struct send_ctx *sctx, ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from); TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to); @@ -834,7 +831,6 @@ static int send_rename(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -851,7 +847,7 @@ static int send_link(struct send_ctx *sctx, ret = begin_cmd(sctx, BTRFS_SEND_C_LINK); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk); @@ -859,7 +855,6 @@ static int send_link(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -875,14 +870,13 @@ static int send_unlink(struct send_ctx *sctx, struct fs_path *path) ret = 
begin_cmd(sctx, BTRFS_SEND_C_UNLINK); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -898,14 +892,13 @@ static int send_rmdir(struct send_ctx *sctx, struct fs_path *path) ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -1897,7 +1890,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, ret = get_inode_info(sctx->send_root, ino, &info); if (ret < 0 && ret != -ENOENT) - goto out; + return ret; left_ret = (info.nlink == 0) ? -ENOENT : ret; left_gen = info.gen; if (send_gen) @@ -1908,7 +1901,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, } else { ret = get_inode_info(sctx->parent_root, ino, &info); if (ret < 0 && ret != -ENOENT) - goto out; + return ret; right_ret = (info.nlink == 0) ? -ENOENT : ret; right_gen = info.gen; if (parent_gen) @@ -1953,7 +1946,6 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, ret = -ENOENT; } -out: return ret; } @@ -1967,17 +1959,14 @@ static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen, ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen); if (ret < 0) - goto out; + return ret; if (ret == inode_state_no_change || ret == inode_state_did_create || ret == inode_state_will_delete) - ret = 1; - else - ret = 0; + return 1; -out: - return ret; + return 0; } /* @@ -2326,9 +2315,8 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, *parent_gen = nce->parent_gen; ret = fs_path_add(dest, nce->name, nce->name_len); if (ret < 0) - goto out; - ret = nce->ret; - goto out; + return ret; + return nce->ret; } } @@ -2339,12 +2327,12 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, */ ret = is_inode_existent(sctx, ino, gen, NULL, NULL); if (ret < 0) - goto out; + return ret; if (!ret) { ret = gen_unique_name(sctx, ino, gen, dest); if (ret < 0) - goto out; + return ret; ret = 1; goto out_cache; } @@ -2360,21 +2348,21 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, ret = get_first_ref(sctx->parent_root, ino, parent_ino, parent_gen, dest); if (ret < 0) - goto out; + return ret; /* * Check if the ref was overwritten by an inode's ref that was processed * earlier. If yes, treat as orphan and return 1. */ ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen, - dest->start, dest->end - dest->start); + dest->start, fs_path_len(dest)); if (ret < 0) - goto out; + return ret; if (ret) { fs_path_reset(dest); ret = gen_unique_name(sctx, ino, gen, dest); if (ret < 0) - goto out; + return ret; ret = 1; } @@ -2383,10 +2371,8 @@ out_cache: * Store the result of the lookup in the name cache. 
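The fs_path cleanups earlier in send.c concentrate its invariants: a small inline buffer, a [start, end) window into it, and a "reversed" mode where components are prepended while walking from an inode up toward the root (fs_path_unreverse() later memmoves the result to the front of the buffer). A compact user-space model of the prepend direction, using a fixed-size buffer instead of the kernel's fs_path_ensure_buf() reallocation and reading the result straight from start:

#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define BUF_LEN 64

/* Stand-in for struct fs_path in "reversed" mode: contents live in
 * [start, end) at the tail of the buffer and grow downwards. */
struct path_buf {
	char buf[BUF_LEN];
	char *start, *end;
};

static void path_reset(struct path_buf *p)
{
	p->start = p->end = p->buf + BUF_LEN - 1;
	*p->start = 0;
}

/* Prepend one component while walking from the inode up toward the root. */
static int path_prepend(struct path_buf *p, const char *name)
{
	size_t nlen = strlen(name);
	size_t need = nlen + (p->start != p->end ? 1 : 0);

	if ((size_t)(p->start - p->buf) < need)
		return -1;          /* the kernel code grows the buffer here */
	if (p->start != p->end)
		*--p->start = '/';
	p->start -= nlen;
	memcpy(p->start, name, nlen);
	return 0;
}

int main(void)
{
	struct path_buf p;

	path_reset(&p);
	path_prepend(&p, "file");     /* leaf first ... */
	path_prepend(&p, "subdir");
	path_prepend(&p, "dir");      /* ... root-most component last */
	printf("%s\n", p.start);      /* prints: dir/subdir/file */
	return 0;
}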
*/ nce = kmalloc(sizeof(*nce) + fs_path_len(dest), GFP_KERNEL); - if (!nce) { - ret = -ENOMEM; - goto out; - } + if (!nce) + return -ENOMEM; nce->entry.key = ino; nce->entry.gen = gen; @@ -2404,10 +2390,9 @@ out_cache: nce_ret = btrfs_lru_cache_store(&sctx->name_cache, &nce->entry, GFP_KERNEL); if (nce_ret < 0) { kfree(nce); - ret = nce_ret; + return nce_ret; } -out: return ret; } @@ -2444,6 +2429,14 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, u64 parent_inode = 0; u64 parent_gen = 0; int stop = 0; + const bool is_cur_inode = (ino == sctx->cur_ino && gen == sctx->cur_inode_gen); + + if (is_cur_inode && fs_path_len(&sctx->cur_inode_path) > 0) { + if (dest != &sctx->cur_inode_path) + return fs_path_copy(dest, &sctx->cur_inode_path); + + return 0; + } name = fs_path_alloc(); if (!name) { @@ -2495,8 +2488,12 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, out: fs_path_free(name); - if (!ret) + if (!ret) { fs_path_unreverse(dest); + if (is_cur_inode && dest != &sctx->cur_inode_path) + ret = fs_path_copy(&sctx->cur_inode_path, dest); + } + return ret; } @@ -2591,6 +2588,47 @@ out: return ret; } +static struct fs_path *get_cur_inode_path(struct send_ctx *sctx) +{ + if (fs_path_len(&sctx->cur_inode_path) == 0) { + int ret; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, + &sctx->cur_inode_path); + if (ret < 0) + return ERR_PTR(ret); + } + + return &sctx->cur_inode_path; +} + +static struct fs_path *get_path_for_command(struct send_ctx *sctx, u64 ino, u64 gen) +{ + struct fs_path *path; + int ret; + + if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen) + return get_cur_inode_path(sctx); + + path = fs_path_alloc(); + if (!path) + return ERR_PTR(-ENOMEM); + + ret = get_cur_path(sctx, ino, gen, path); + if (ret < 0) { + fs_path_free(path); + return ERR_PTR(ret); + } + + return path; +} + +static void free_path_for_command(const struct send_ctx *sctx, struct fs_path *path) +{ + if (path != &sctx->cur_inode_path) + fs_path_free(path); +} + static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) { struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; @@ -2599,17 +2637,14 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) btrfs_debug(fs_info, "send_truncate %llu size=%llu", ino, size); - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE); if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size); @@ -2617,7 +2652,7 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, p); return ret; } @@ -2629,17 +2664,14 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode) btrfs_debug(fs_info, "send_chmod %llu mode=%llu", ino, mode); - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD); if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777); @@ -2647,7 +2679,7 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, 
p); return ret; } @@ -2662,17 +2694,14 @@ static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr) btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr); - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR); if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr); @@ -2680,7 +2709,7 @@ static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, p); return ret; } @@ -2693,17 +2722,14 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) btrfs_debug(fs_info, "send_chown %llu uid=%llu, gid=%llu", ino, uid, gid); - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN); if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid); TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid); @@ -2712,7 +2738,7 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, p); return ret; } @@ -2729,9 +2755,9 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) btrfs_debug(fs_info, "send_utimes %llu", ino); - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); path = alloc_path_for_send(); if (!path) { @@ -2756,9 +2782,6 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime); TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime); @@ -2770,7 +2793,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, p); btrfs_free_path(path); return ret; } @@ -3106,6 +3129,11 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen, goto out; ret = send_rename(sctx, path, orphan); + if (ret < 0) + goto out; + + if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen) + ret = fs_path_copy(&sctx->cur_inode_path, orphan); out: fs_path_free(orphan); @@ -4158,6 +4186,23 @@ out: return ret; } +static int rename_current_inode(struct send_ctx *sctx, + struct fs_path *current_path, + struct fs_path *new_path) +{ + int ret; + + ret = send_rename(sctx, current_path, new_path); + if (ret < 0) + return ret; + + ret = fs_path_copy(&sctx->cur_inode_path, new_path); + if (ret < 0) + return ret; + + return fs_path_copy(current_path, new_path); +} + /* * This does all the move/link/unlink/rmdir magic. 
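The new cur_inode_path field and the get_path_for_command()/free_path_for_command() pair above implement a borrow-or-own discipline: commands addressing the current inode borrow one lazily computed, cached path; any other inode gets a freshly allocated path; and the free side releases only paths the caller owns. The cache is invalidated (reset or rewritten) whenever the inode's path changes, as the orphanize_inode() and process_recorded_refs() hunks show. A minimal model of the discipline, where compute_path() is a hypothetical stand-in for get_cur_path():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct ctx {
	long cur_ino;
	char *cur_path;          /* lazily filled cache for cur_ino */
};

static char *compute_path(long ino)     /* stand-in for the real lookup */
{
	char *p = malloc(32);

	if (p)
		snprintf(p, 32, "o%ld-unique", ino);
	return p;
}

static char *get_path_for_command(struct ctx *sctx, long ino)
{
	if (ino == sctx->cur_ino) {
		if (!sctx->cur_path)                 /* fill the cache once */
			sctx->cur_path = compute_path(ino);
		return sctx->cur_path;               /* borrowed */
	}
	return compute_path(ino);                    /* owned by the caller */
}

static void free_path_for_command(struct ctx *sctx, char *path)
{
	if (path != sctx->cur_path)   /* free only what the caller owns */
		free(path);
}

int main(void)
{
	struct ctx sctx = { .cur_ino = 257, .cur_path = NULL };
	char *a = get_path_for_command(&sctx, 257);  /* borrowed */
	char *b = get_path_for_command(&sctx, 300);  /* owned */

	printf("%s %s\n", a, b);
	free_path_for_command(&sctx, a);
	free_path_for_command(&sctx, b);
	free(sctx.cur_path);
	return 0;
}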
*/ @@ -4172,9 +4217,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) u64 ow_inode = 0; u64 ow_gen; u64 ow_mode; - int did_overwrite = 0; - int is_orphan = 0; u64 last_dir_ino_rm = 0; + bool did_overwrite = false; + bool is_orphan = false; bool can_rename = true; bool orphanized_dir = false; bool orphanized_ancestor = false; @@ -4216,14 +4261,14 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (ret < 0) goto out; if (ret) - did_overwrite = 1; + did_overwrite = true; } if (sctx->cur_inode_new || did_overwrite) { ret = gen_unique_name(sctx, sctx->cur_ino, sctx->cur_inode_gen, valid_path); if (ret < 0) goto out; - is_orphan = 1; + is_orphan = true; } else { ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, valid_path); @@ -4348,6 +4393,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (ret > 0) { orphanized_ancestor = true; fs_path_reset(valid_path); + fs_path_reset(&sctx->cur_inode_path); ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, valid_path); @@ -4443,13 +4489,10 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * it depending on the inode mode. */ if (is_orphan && can_rename) { - ret = send_rename(sctx, valid_path, cur->full_path); - if (ret < 0) - goto out; - is_orphan = 0; - ret = fs_path_copy(valid_path, cur->full_path); + ret = rename_current_inode(sctx, valid_path, cur->full_path); if (ret < 0) goto out; + is_orphan = false; } else if (can_rename) { if (S_ISDIR(sctx->cur_inode_mode)) { /* @@ -4457,10 +4500,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * dirs, we always have one new and one deleted * ref. The deleted ref is ignored later. */ - ret = send_rename(sctx, valid_path, - cur->full_path); - if (!ret) - ret = fs_path_copy(valid_path, + ret = rename_current_inode(sctx, valid_path, cur->full_path); if (ret < 0) goto out; @@ -4507,7 +4547,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) sctx->cur_inode_gen, valid_path); if (ret < 0) goto out; - is_orphan = 1; + is_orphan = true; } list_for_each_entry(cur, &sctx->deleted_refs, list) { @@ -4553,6 +4593,8 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) ret = send_unlink(sctx, cur->full_path); if (ret < 0) goto out; + if (is_current_inode_path(sctx, cur->full_path)) + fs_path_reset(&sctx->cur_inode_path); } ret = dup_ref(cur, &check_dirs); if (ret < 0) @@ -4701,7 +4743,7 @@ out: static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) { - int ret = 0; + int ret; struct send_ctx *sctx = ctx; struct rb_node *node = NULL; struct recorded_ref data; @@ -4710,7 +4752,7 @@ static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) ret = get_inode_gen(sctx->send_root, dir, &dir_gen); if (ret < 0) - goto out; + return ret; data.dir = dir; data.dir_gen = dir_gen; @@ -4724,13 +4766,13 @@ static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) &sctx->new_refs, name, dir, dir_gen, sctx); } -out: + return ret; } static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) { - int ret = 0; + int ret; struct send_ctx *sctx = ctx; struct rb_node *node = NULL; struct recorded_ref data; @@ -4739,7 +4781,7 @@ static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx ret = get_inode_gen(sctx->parent_root, dir, &dir_gen); if (ret < 0) - goto out; + return ret; data.dir = dir; data.dir_gen = dir_gen; @@ -4753,7 
+4795,7 @@ static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx &sctx->deleted_refs, name, dir, dir_gen, sctx); } -out: + return ret; } @@ -4764,11 +4806,9 @@ static int record_new_ref(struct send_ctx *sctx) ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key, 0, record_new_ref_if_needed, sctx); if (ret < 0) - goto out; - ret = 0; + return ret; -out: - return ret; + return 0; } static int record_deleted_ref(struct send_ctx *sctx) @@ -4779,29 +4819,25 @@ static int record_deleted_ref(struct send_ctx *sctx) sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx); if (ret < 0) - goto out; - ret = 0; + return ret; -out: - return ret; + return 0; } static int record_changed_ref(struct send_ctx *sctx) { - int ret = 0; + int ret; ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key, 0, record_new_ref_if_needed, sctx); if (ret < 0) - goto out; + return ret; ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx); if (ret < 0) - goto out; - ret = 0; + return ret; -out: - return ret; + return 0; } /* @@ -4869,15 +4905,19 @@ out: } static int send_set_xattr(struct send_ctx *sctx, - struct fs_path *path, const char *name, int name_len, const char *data, int data_len) { - int ret = 0; + struct fs_path *path; + int ret; + + path = get_cur_inode_path(sctx); + if (IS_ERR(path)) + return PTR_ERR(path); ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len); @@ -4886,7 +4926,6 @@ static int send_set_xattr(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -4894,11 +4933,11 @@ static int send_remove_xattr(struct send_ctx *sctx, struct fs_path *path, const char *name, int name_len) { - int ret = 0; + int ret; ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len); @@ -4906,7 +4945,6 @@ static int send_remove_xattr(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -4914,19 +4952,13 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, const char *name, int name_len, const char *data, int data_len, void *ctx) { - int ret; struct send_ctx *sctx = ctx; - struct fs_path *p; struct posix_acl_xattr_header dummy_acl; /* Capabilities are emitted by finish_inode_if_needed */ if (!strncmp(name, XATTR_NAME_CAPS, name_len)) return 0; - p = fs_path_alloc(); - if (!p) - return -ENOMEM; - /* * This hack is needed because empty acls are stored as zero byte * data in xattrs. 
Problem with that is, that receiving these zero byte @@ -4943,48 +4975,27 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, } } - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto out; - - ret = send_set_xattr(sctx, p, name, name_len, data, data_len); - -out: - fs_path_free(p); - return ret; + return send_set_xattr(sctx, name, name_len, data, data_len); } static int __process_deleted_xattr(int num, struct btrfs_key *di_key, const char *name, int name_len, const char *data, int data_len, void *ctx) { - int ret; struct send_ctx *sctx = ctx; struct fs_path *p; - p = fs_path_alloc(); - if (!p) - return -ENOMEM; - - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto out; - - ret = send_remove_xattr(sctx, p, name, name_len); + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) + return PTR_ERR(p); -out: - fs_path_free(p); - return ret; + return send_remove_xattr(sctx, p, name, name_len); } static int process_new_xattr(struct send_ctx *sctx) { - int ret = 0; - - ret = iterate_dir_item(sctx->send_root, sctx->left_path, - __process_new_xattr, sctx); - - return ret; + return iterate_dir_item(sctx->send_root, sctx->left_path, + __process_new_xattr, sctx); } static int process_deleted_xattr(struct send_ctx *sctx) @@ -5100,17 +5111,15 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key, static int process_changed_xattr(struct send_ctx *sctx) { - int ret = 0; + int ret; ret = iterate_dir_item(sctx->send_root, sctx->left_path, __process_changed_new_xattr, sctx); if (ret < 0) - goto out; - ret = iterate_dir_item(sctx->parent_root, sctx->right_path, - __process_changed_deleted_xattr, sctx); + return ret; -out: - return ret; + return iterate_dir_item(sctx->parent_root, sctx->right_path, + __process_changed_deleted_xattr, sctx); } static int process_all_new_xattrs(struct send_ctx *sctx) @@ -5157,7 +5166,7 @@ static int send_verity(struct send_ctx *sctx, struct fs_path *path, ret = begin_cmd(sctx, BTRFS_SEND_C_ENABLE_VERITY); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM, @@ -5172,21 +5181,20 @@ static int send_verity(struct send_ctx *sctx, struct fs_path *path, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } static int process_verity(struct send_ctx *sctx) { int ret = 0; - struct inode *inode; + struct btrfs_inode *inode; struct fs_path *p; inode = btrfs_iget(sctx->cur_ino, sctx->send_root); if (IS_ERR(inode)) return PTR_ERR(inode); - ret = btrfs_get_verity_descriptor(inode, NULL, 0); + ret = btrfs_get_verity_descriptor(&inode->vfs_inode, NULL, 0); if (ret < 0) goto iput; @@ -5203,27 +5211,19 @@ static int process_verity(struct send_ctx *sctx) } } - ret = btrfs_get_verity_descriptor(inode, sctx->verity_descriptor, ret); + ret = btrfs_get_verity_descriptor(&inode->vfs_inode, sctx->verity_descriptor, ret); if (ret < 0) goto iput; - p = fs_path_alloc(); - if (!p) { - ret = -ENOMEM; + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) { + ret = PTR_ERR(p); goto iput; } - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto free_path; ret = send_verity(sctx, p, sctx->verity_descriptor); - if (ret < 0) - goto free_path; - -free_path: - fs_path_free(p); iput: - iput(inode); + iput(&inode->vfs_inode); return ret; } @@ -5343,31 +5343,25 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) int ret = 0; struct fs_path *p; - p = fs_path_alloc(); - if (!p) - return 
-ENOMEM; - btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len); - ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); - if (ret < 0) - goto out; + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) + return PTR_ERR(p); - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); ret = put_file_data(sctx, offset, len); if (ret < 0) - goto out; + return ret; ret = send_cmd(sctx); tlv_put_failure: -out: - fs_path_free(p); return ret; } @@ -5380,6 +5374,7 @@ static int send_clone(struct send_ctx *sctx, { int ret = 0; struct fs_path *p; + struct fs_path *cur_inode_path; u64 gen; btrfs_debug(sctx->send_root->fs_info, @@ -5387,6 +5382,10 @@ static int send_clone(struct send_ctx *sctx, offset, len, btrfs_root_id(clone_root->root), clone_root->ino, clone_root->offset); + cur_inode_path = get_cur_inode_path(sctx); + if (IS_ERR(cur_inode_path)) + return PTR_ERR(cur_inode_path); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -5395,13 +5394,9 @@ static int send_clone(struct send_ctx *sctx, if (ret < 0) goto out; - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto out; - TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len); - TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, cur_inode_path); if (clone_root->root == sctx->send_root) { ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen); @@ -5452,17 +5447,13 @@ static int send_update_extent(struct send_ctx *sctx, int ret = 0; struct fs_path *p; - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT); if (ret < 0) - goto out; - - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); @@ -5471,8 +5462,6 @@ static int send_update_extent(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: - fs_path_free(p); return ret; } @@ -5501,12 +5490,10 @@ static int send_hole(struct send_ctx *sctx, u64 end) if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) return send_update_extent(sctx, offset, end - offset); - p = fs_path_alloc(); - if (!p) - return -ENOMEM; - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto tlv_put_failure; + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) + return PTR_ERR(p); + while (offset < end) { u64 len = min(end - offset, read_size); @@ -5527,7 +5514,6 @@ static int send_hole(struct send_ctx *sctx, u64 end) } sctx->cur_inode_next_write_offset = offset; tlv_put_failure: - fs_path_free(p); return ret; } @@ -5535,9 +5521,7 @@ static int send_encoded_inline_extent(struct send_ctx *sctx, struct btrfs_path *path, u64 offset, u64 len) { - struct btrfs_root *root = sctx->send_root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct inode *inode; + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; struct fs_path *fspath; struct extent_buffer *leaf = path->nodes[0]; struct btrfs_key key; @@ -5546,23 +5530,13 @@ static int send_encoded_inline_extent(struct send_ctx *sctx, size_t inline_size; int ret; - inode = btrfs_iget(sctx->cur_ino, root); - if (IS_ERR(inode)) - return PTR_ERR(inode); - - fspath = fs_path_alloc(); - if 
(!fspath) { - ret = -ENOMEM; - goto out; - } + fspath = get_cur_inode_path(sctx); + if (IS_ERR(fspath)) + return PTR_ERR(fspath); ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE); if (ret < 0) - goto out; - - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); - if (ret < 0) - goto out; + return ret; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -5578,12 +5552,12 @@ static int send_encoded_inline_extent(struct send_ctx *sctx, ret = btrfs_encoded_io_compression_from_extent(fs_info, btrfs_file_extent_compression(leaf, ei)); if (ret < 0) - goto out; + return ret; TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret); ret = put_data_header(sctx, inline_size); if (ret < 0) - goto out; + return ret; read_extent_buffer(leaf, sctx->send_buf + sctx->send_size, btrfs_file_extent_inline_start(ei), inline_size); sctx->send_size += inline_size; @@ -5591,9 +5565,6 @@ static int send_encoded_inline_extent(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: - fs_path_free(fspath); - iput(inode); return ret; } @@ -5602,7 +5573,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; - struct inode *inode; + struct btrfs_inode *inode; struct fs_path *fspath; struct extent_buffer *leaf = path->nodes[0]; struct btrfs_key key; @@ -5617,9 +5588,9 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, if (IS_ERR(inode)) return PTR_ERR(inode); - fspath = fs_path_alloc(); - if (!fspath) { - ret = -ENOMEM; + fspath = get_cur_inode_path(sctx); + if (IS_ERR(fspath)) { + ret = PTR_ERR(fspath); goto out; } @@ -5627,10 +5598,6 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, if (ret < 0) goto out; - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); - if (ret < 0) - goto out; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); @@ -5672,7 +5639,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, * Note that send_buf is a mapping of send_buf_pages, so this is really * reading into send_buf. 
*/ - ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), + ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, disk_num_bytes, sctx->send_buf_pages + (data_offset >> PAGE_SHIFT), @@ -5698,8 +5665,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, tlv_put_failure: out: - fs_path_free(fspath); - iput(inode); + iput(&inode->vfs_inode); return ret; } @@ -5741,15 +5707,14 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, } if (sctx->cur_inode == NULL) { + struct btrfs_inode *btrfs_inode; struct btrfs_root *root = sctx->send_root; - sctx->cur_inode = btrfs_iget(sctx->cur_ino, root); - if (IS_ERR(sctx->cur_inode)) { - int err = PTR_ERR(sctx->cur_inode); + btrfs_inode = btrfs_iget(sctx->cur_ino, root); + if (IS_ERR(btrfs_inode)) + return PTR_ERR(btrfs_inode); - sctx->cur_inode = NULL; - return err; - } + sctx->cur_inode = &btrfs_inode->vfs_inode; memset(&sctx->ra, 0, sizeof(struct file_ra_state)); file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping); @@ -5828,7 +5793,6 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, */ static int send_capabilities(struct send_ctx *sctx) { - struct fs_path *fspath = NULL; struct btrfs_path *path; struct btrfs_dir_item *di; struct extent_buffer *leaf; @@ -5854,25 +5818,19 @@ static int send_capabilities(struct send_ctx *sctx) leaf = path->nodes[0]; buf_len = btrfs_dir_data_len(leaf, di); - fspath = fs_path_alloc(); buf = kmalloc(buf_len, GFP_KERNEL); - if (!fspath || !buf) { + if (!buf) { ret = -ENOMEM; goto out; } - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); - if (ret < 0) - goto out; - data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di); read_extent_buffer(leaf, buf, data_ptr, buf_len); - ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS, + ret = send_set_xattr(sctx, XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), buf, buf_len); out: kfree(buf); - fs_path_free(fspath); btrfs_free_path(path); return ret; } @@ -6898,6 +6856,7 @@ static int changed_inode(struct send_ctx *sctx, sctx->cur_inode_last_extent = (u64)-1; sctx->cur_inode_next_write_offset = 0; sctx->ignore_cur_inode = false; + fs_path_reset(&sctx->cur_inode_path); /* * Set send_progress to current inode. 
This will tell all get_cur_xxx @@ -8107,10 +8066,9 @@ static void dedupe_in_progress_warn(const struct btrfs_root *root) btrfs_root_id(root), root->dedupe_in_progress); } -long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg) +long btrfs_ioctl_send(struct btrfs_root *send_root, const struct btrfs_ioctl_send_args *arg) { int ret = 0; - struct btrfs_root *send_root = inode->root; struct btrfs_fs_info *fs_info = send_root->fs_info; struct btrfs_root *clone_root; struct send_ctx *sctx = NULL; @@ -8173,6 +8131,7 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a goto out; } + init_path(&sctx->cur_inode_path); INIT_LIST_HEAD(&sctx->new_refs); INIT_LIST_HEAD(&sctx->deleted_refs); @@ -8449,6 +8408,9 @@ out: btrfs_lru_cache_clear(&sctx->dir_created_cache); btrfs_lru_cache_clear(&sctx->dir_utimes_cache); + if (sctx->cur_inode_path.buf != sctx->cur_inode_path.inline_buf) + kfree(sctx->cur_inode_path.buf); + kfree(sctx); } diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 9309886c5ea1..652bb28f63d4 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -11,7 +11,7 @@ #include <linux/sizes.h> #include <linux/align.h> -struct btrfs_inode; +struct btrfs_root; struct btrfs_ioctl_send_args; #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" @@ -182,6 +182,6 @@ enum { __BTRFS_SEND_A_MAX = 35, }; -long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg); +long btrfs_ioctl_send(struct btrfs_root *send_root, const struct btrfs_ioctl_send_args *arg); #endif diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index a341d087567a..ff089e3e4103 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include "linux/spinlock.h" +#include <linux/spinlock.h> #include <linux/minmax.h> #include "misc.h" #include "ctree.h" diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 722acf768396..11dbd7be6a3b 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -2,12 +2,11 @@ #include <linux/slab.h> #include "messages.h" -#include "ctree.h" #include "subpage.h" #include "btrfs_inode.h" /* - * Subpage (sectorsize < PAGE_SIZE) support overview: + * Subpage (block size < folio size) support overview: * * Limitations: * @@ -64,35 +63,14 @@ * This means a slightly higher tree locking latency. */ -#if PAGE_SIZE > SZ_4K -bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping) -{ - if (fs_info->sectorsize >= PAGE_SIZE) - return false; - - /* - * Only data pages (either through DIO or compression) can have no - * mapping. And if page->mapping->host is data inode, it's subpage. - * As we have ruled our sectorsize >= PAGE_SIZE case already. - */ - if (!mapping || !mapping->host || is_data_inode(BTRFS_I(mapping->host))) - return true; - - /* - * Now the only remaining case is metadata, which we only go subpage - * routine if nodesize < PAGE_SIZE. - */ - if (fs_info->nodesize < PAGE_SIZE) - return true; - return false; -} -#endif - int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, enum btrfs_subpage_type type) { struct btrfs_subpage *subpage; + /* For metadata we don't support large folio yet. */ + ASSERT(!folio_test_large(folio)); + /* * We have cases like a dummy extent buffer page, which is not mapped * and doesn't need to be locked. 
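The subpage.c hunks below generalize the bookkeeping from the fixed fs_info->sectors_per_page to a per-folio btrfs_blocks_per_folio(fs_info, folio) count, so the same code covers both 4K pages and larger data folios. All state for one folio lives in a single bitmap with one section per flag (uptodate, dirty, writeback, ordered, checked, locked), each blocks_per_folio bits wide, which is why subpage_calc_start_bit() adds blocks_per_folio * btrfs_bitmap_nr_##name to the block offset. The standalone sketch below models only that index arithmetic; the section order and the folio-size shift are assumptions consistent with the "fsize >> fs_info->sectorsize_bits" sizing visible in btrfs_alloc_subpage(), not kernel code:

#include <stdio.h>

/* Assumed section order, mirroring the btrfs_bitmap_nr_* enum. */
enum toy_section {
	TOY_UPTODATE, TOY_DIRTY, TOY_WRITEBACK,
	TOY_ORDERED, TOY_CHECKED, TOY_LOCKED
};

/* btrfs_blocks_per_folio() presumably reduces to this, matching the
 * bitmap sizing done in btrfs_alloc_subpage(). */
static unsigned int toy_blocks_per_folio(unsigned long folio_size,
					 unsigned int sectorsize_bits)
{
	return folio_size >> sectorsize_bits;
}

/* Shape of subpage_calc_start_bit(): the block offset within the folio
 * plus the base of the named bitmap section. */
static unsigned int toy_calc_start_bit(unsigned long folio_size,
				       unsigned int sectorsize_bits,
				       enum toy_section sec,
				       unsigned long offset_in_folio)
{
	return (offset_in_folio >> sectorsize_bits) +
	       sec * toy_blocks_per_folio(folio_size, sectorsize_bits);
}

int main(void)
{
	/* A 16K folio of 4K blocks: four blocks, six 4-bit sections. */
	printf("blocks per folio: %u\n", toy_blocks_per_folio(16384, 12));
	printf("dirty bit for the block at offset 8K: %u\n",
	       toy_calc_start_bit(16384, 12, TOY_DIRTY, 8192));
	return 0;
}

For a 16K folio this prints 4 and 6 (block index 2 plus a dirty-section base of 4), which is the same arithmetic the reworked macros perform one named section at a time.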
@@ -101,10 +79,14 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, ASSERT(folio_test_locked(folio)); /* Either not subpage, or the folio already has private attached. */ - if (!btrfs_is_subpage(fs_info, folio->mapping) || folio_test_private(folio)) + if (folio_test_private(folio)) + return 0; + if (type == BTRFS_SUBPAGE_METADATA && !btrfs_meta_is_subpage(fs_info)) + return 0; + if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio)) return 0; - subpage = btrfs_alloc_subpage(fs_info, type); + subpage = btrfs_alloc_subpage(fs_info, folio_size(folio), type); if (IS_ERR(subpage)) return PTR_ERR(subpage); @@ -112,12 +94,17 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, return 0; } -void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio) +void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, + enum btrfs_subpage_type type) { struct btrfs_subpage *subpage; /* Either not subpage, or the folio already has private attached. */ - if (!btrfs_is_subpage(fs_info, folio->mapping) || !folio_test_private(folio)) + if (!folio_test_private(folio)) + return; + if (type == BTRFS_SUBPAGE_METADATA && !btrfs_meta_is_subpage(fs_info)) + return; + if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio)) return; subpage = folio_detach_private(folio); @@ -126,15 +113,16 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *fol } struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, - enum btrfs_subpage_type type) + size_t fsize, enum btrfs_subpage_type type) { struct btrfs_subpage *ret; unsigned int real_size; - ASSERT(fs_info->sectorsize < PAGE_SIZE); + ASSERT(fs_info->sectorsize < fsize); real_size = struct_size(ret, bitmaps, - BITS_TO_LONGS(btrfs_bitmap_nr_max * fs_info->sectors_per_page)); + BITS_TO_LONGS(btrfs_bitmap_nr_max * + (fsize >> fs_info->sectorsize_bits))); ret = kzalloc(real_size, GFP_NOFS); if (!ret) return ERR_PTR(-ENOMEM); @@ -165,7 +153,7 @@ void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio * { struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_meta_is_subpage(fs_info)) return; ASSERT(folio_test_private(folio) && folio->mapping); @@ -179,7 +167,7 @@ void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio * { struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_meta_is_subpage(fs_info)) return; ASSERT(folio_test_private(folio) && folio->mapping); @@ -206,16 +194,18 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, */ if (folio->mapping) ASSERT(folio_pos(folio) <= start && - start + len <= folio_pos(folio) + PAGE_SIZE); + start + len <= folio_pos(folio) + folio_size(folio)); } #define subpage_calc_start_bit(fs_info, folio, name, start, len) \ ({ \ - unsigned int __start_bit; \ + unsigned int __start_bit; \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ \ btrfs_subpage_assert(fs_info, folio, start, len); \ __start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ - __start_bit += fs_info->sectors_per_page * btrfs_bitmap_nr_##name; \ + __start_bit += blocks_per_folio * btrfs_bitmap_nr_##name; \ __start_bit; \ }) @@ -233,7 +223,7 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) if (folio_pos(folio) >= orig_start + orig_len) *len = 0; else - *len = min_t(u64, folio_pos(folio) + PAGE_SIZE, + *len = 
min_t(u64, folio_pos(folio) + folio_size(folio), orig_start + orig_len) - *start; } @@ -296,7 +286,7 @@ void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, ASSERT(folio_test_locked(folio)); - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio)) { folio_unlock(folio); return; } @@ -323,13 +313,14 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, unsigned long bitmap) { struct btrfs_subpage *subpage = folio_get_private(folio); - const int start_bit = fs_info->sectors_per_page * btrfs_bitmap_nr_locked; + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); + const int start_bit = blocks_per_folio * btrfs_bitmap_nr_locked; unsigned long flags; bool last = false; int cleared = 0; int bit; - if (!btrfs_is_subpage(fs_info, folio->mapping)) { + if (!btrfs_is_subpage(fs_info, folio)) { folio_unlock(folio); return; } @@ -341,7 +332,7 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, } spin_lock_irqsave(&subpage->lock, flags); - for_each_set_bit(bit, &bitmap, fs_info->sectors_per_page) { + for_each_set_bit(bit, &bitmap, blocks_per_folio) { if (test_and_clear_bit(bit + start_bit, subpage->bitmaps)) cleared++; } @@ -352,15 +343,27 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, folio_unlock(folio); } -#define subpage_test_bitmap_all_set(fs_info, subpage, name) \ +#define subpage_test_bitmap_all_set(fs_info, folio, name) \ +({ \ + struct btrfs_subpage *subpage = folio_get_private(folio); \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ + \ bitmap_test_range_all_set(subpage->bitmaps, \ - fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \ - fs_info->sectors_per_page) + blocks_per_folio * btrfs_bitmap_nr_##name, \ + blocks_per_folio); \ +}) -#define subpage_test_bitmap_all_zero(fs_info, subpage, name) \ +#define subpage_test_bitmap_all_zero(fs_info, folio, name) \ +({ \ + struct btrfs_subpage *subpage = folio_get_private(folio); \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ + \ bitmap_test_range_all_zero(subpage->bitmaps, \ - fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \ - fs_info->sectors_per_page) + blocks_per_folio * btrfs_bitmap_nr_##name, \ + blocks_per_folio); \ +}) void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) @@ -372,7 +375,7 @@ void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate)) + if (subpage_test_bitmap_all_set(fs_info, folio, uptodate)) folio_mark_uptodate(folio); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -426,7 +429,7 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_zero(fs_info, subpage, dirty)) + if (subpage_test_bitmap_all_zero(fs_info, folio, dirty)) last = true; spin_unlock_irqrestore(&subpage->lock, flags); return last; @@ -467,7 +470,7 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if 
(subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) { + if (subpage_test_bitmap_all_zero(fs_info, folio, writeback)) { ASSERT(folio_test_writeback(folio)); folio_end_writeback(folio); } @@ -498,7 +501,7 @@ void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered)) + if (subpage_test_bitmap_all_zero(fs_info, folio, ordered)) folio_clear_ordered(folio); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -513,7 +516,7 @@ void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_set(fs_info, subpage, checked)) + if (subpage_test_bitmap_all_set(fs_info, folio, checked)) folio_set_checked(folio); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -569,7 +572,7 @@ void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) { \ + !btrfs_is_subpage(fs_info, folio)) { \ folio_set_func(folio); \ return; \ } \ @@ -579,7 +582,7 @@ void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) { \ + !btrfs_is_subpage(fs_info, folio)) { \ folio_clear_func(folio); \ return; \ } \ @@ -589,7 +592,7 @@ bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) \ + !btrfs_is_subpage(fs_info, folio)) \ return folio_test_func(folio); \ return btrfs_subpage_test_##name(fs_info, folio, start, len); \ } \ @@ -597,7 +600,7 @@ void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) { \ + !btrfs_is_subpage(fs_info, folio)) { \ folio_set_func(folio); \ return; \ } \ @@ -608,7 +611,7 @@ void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) { \ + !btrfs_is_subpage(fs_info, folio)) { \ folio_clear_func(folio); \ return; \ } \ @@ -619,10 +622,32 @@ bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) \ + !btrfs_is_subpage(fs_info, folio)) \ return folio_test_func(folio); \ btrfs_subpage_clamp_range(folio, &start, &len); \ return btrfs_subpage_test_##name(fs_info, folio, start, len); \ +} \ +void btrfs_meta_folio_set_##name(struct folio *folio, const struct extent_buffer *eb) \ +{ \ + if (!btrfs_meta_is_subpage(eb->fs_info)) { \ + folio_set_func(folio); \ + return; \ + } \ + btrfs_subpage_set_##name(eb->fs_info, folio, eb->start, eb->len); \ +} \ +void btrfs_meta_folio_clear_##name(struct folio *folio, const struct extent_buffer *eb) \ +{ \ + if (!btrfs_meta_is_subpage(eb->fs_info)) { \ + folio_clear_func(folio); \ + return; \ + } \ + btrfs_subpage_clear_##name(eb->fs_info, folio, eb->start, eb->len); \ +} \ +bool btrfs_meta_folio_test_##name(struct folio *folio, const struct 
extent_buffer *eb) \ +{ \ + if (!btrfs_meta_is_subpage(eb->fs_info)) \ + return folio_test_func(folio); \ + return btrfs_subpage_test_##name(eb->fs_info, folio, eb->start, eb->len); \ } IMPLEMENT_BTRFS_PAGE_OPS(uptodate, folio_mark_uptodate, folio_clear_uptodate, folio_test_uptodate); @@ -635,26 +660,29 @@ IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered, IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, folio_test_checked); -#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ +#define GET_SUBPAGE_BITMAP(fs_info, folio, name, dst) \ { \ - const int sectors_per_page = fs_info->sectors_per_page; \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ + const struct btrfs_subpage *subpage = folio_get_private(folio); \ \ - ASSERT(sectors_per_page < BITS_PER_LONG); \ + ASSERT(blocks_per_folio < BITS_PER_LONG); \ *dst = bitmap_read(subpage->bitmaps, \ - sectors_per_page * btrfs_bitmap_nr_##name, \ - sectors_per_page); \ + blocks_per_folio * btrfs_bitmap_nr_##name, \ + blocks_per_folio); \ } #define SUBPAGE_DUMP_BITMAP(fs_info, folio, name, start, len) \ { \ - const struct btrfs_subpage *subpage = folio_get_private(folio); \ unsigned long bitmap; \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ \ - GET_SUBPAGE_BITMAP(subpage, fs_info, name, &bitmap); \ + GET_SUBPAGE_BITMAP(fs_info, folio, name, &bitmap); \ btrfs_warn(fs_info, \ "dumpping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \ start, len, folio_pos(folio), \ - fs_info->sectors_per_page, &bitmap); \ + blocks_per_folio, &bitmap); \ } /* @@ -672,7 +700,7 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) return; - if (!btrfs_is_subpage(fs_info, folio->mapping)) { + if (!btrfs_is_subpage(fs_info, folio)) { ASSERT(!folio_test_dirty(folio)); return; } @@ -707,7 +735,7 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, int ret; ASSERT(folio_test_locked(folio)); - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio)) return; subpage = folio_get_private(folio); @@ -721,15 +749,37 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, } bitmap_set(subpage->bitmaps, start_bit, nbits); ret = atomic_add_return(nbits, &subpage->nr_locked); - ASSERT(ret <= fs_info->sectors_per_page); + ASSERT(ret <= btrfs_blocks_per_folio(fs_info, folio)); spin_unlock_irqrestore(&subpage->lock, flags); } +/* + * Clear the dirty flag for the folio. + * + * If the affected folio is no longer dirty, return true. Otherwise return false. 
+ */ +bool btrfs_meta_folio_clear_and_test_dirty(struct folio *folio, const struct extent_buffer *eb) +{ + bool last; + + if (!btrfs_meta_is_subpage(eb->fs_info)) { + folio_clear_dirty_for_io(folio); + return true; + } + + last = btrfs_subpage_clear_and_test_dirty(eb->fs_info, folio, eb->start, eb->len); + if (last) { + folio_clear_dirty_for_io(folio); + return true; + } + return false; +} + void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage; - const u32 sectors_per_page = fs_info->sectors_per_page; + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); unsigned long uptodate_bitmap; unsigned long dirty_bitmap; unsigned long writeback_bitmap; @@ -739,28 +789,28 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, unsigned long flags; ASSERT(folio_test_private(folio) && folio_get_private(folio)); - ASSERT(sectors_per_page > 1); + ASSERT(blocks_per_folio > 1); subpage = folio_get_private(folio); spin_lock_irqsave(&subpage->lock, flags); - GET_SUBPAGE_BITMAP(subpage, fs_info, uptodate, &uptodate_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, &dirty_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, writeback, &writeback_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, ordered, &ordered_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, checked, &checked_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &locked_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, uptodate, &uptodate_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, dirty, &dirty_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, writeback, &writeback_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, ordered, &ordered_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, checked, &checked_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, locked, &locked_bitmap); spin_unlock_irqrestore(&subpage->lock, flags); dump_page(folio_page(folio, 0), "btrfs subpage dump"); btrfs_warn(fs_info, "start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", start, len, folio_pos(folio), - sectors_per_page, &uptodate_bitmap, - sectors_per_page, &dirty_bitmap, - sectors_per_page, &locked_bitmap, - sectors_per_page, &writeback_bitmap, - sectors_per_page, &ordered_bitmap, - sectors_per_page, &checked_bitmap); + blocks_per_folio, &uptodate_bitmap, + blocks_per_folio, &dirty_bitmap, + blocks_per_folio, &locked_bitmap, + blocks_per_folio, &writeback_bitmap, + blocks_per_folio, &ordered_bitmap, + blocks_per_folio, &checked_bitmap); } void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, @@ -771,10 +821,10 @@ void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, unsigned long flags; ASSERT(folio_test_private(folio) && folio_get_private(folio)); - ASSERT(fs_info->sectors_per_page > 1); + ASSERT(btrfs_blocks_per_folio(fs_info, folio) > 1); subpage = folio_get_private(folio); spin_lock_irqsave(&subpage->lock, flags); - GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, ret_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, dirty, ret_bitmap); spin_unlock_irqrestore(&subpage->lock, flags); } diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 44fff1f4eac4..3042c5ea840a 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -6,10 +6,11 @@ #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/sizes.h> +#include "btrfs_inode.h" +#include "fs.h" struct address_space; struct folio; -struct btrfs_fs_info; /* * Extra info for subpapge 
bitmap. @@ -69,23 +70,49 @@ enum btrfs_subpage_type { BTRFS_SUBPAGE_DATA, }; -#if PAGE_SIZE > SZ_4K -bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping); +#if PAGE_SIZE > BTRFS_MIN_BLOCKSIZE +/* + * Subpage support for metadata is more complex, as we can have dummy extent + * buffers, where folios have no mapping to determine the owning inode. + * + * Thankfully we only need to check if node size is smaller than page size. + * Even with larger folio support, we will only allocate a folio as large as + * node size. + * Thus if nodesize < PAGE_SIZE, we know metadata needs need to subpage routine. + */ +static inline bool btrfs_meta_is_subpage(const struct btrfs_fs_info *fs_info) +{ + return fs_info->nodesize < PAGE_SIZE; +} +static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, + struct folio *folio) +{ + if (folio->mapping && folio->mapping->host) + ASSERT(is_data_inode(BTRFS_I(folio->mapping->host))); + return fs_info->sectorsize < folio_size(folio); +} #else +static inline bool btrfs_meta_is_subpage(const struct btrfs_fs_info *fs_info) +{ + return false; +} static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, - struct address_space *mapping) + struct folio *folio) { + if (folio->mapping && folio->mapping->host) + ASSERT(is_data_inode(BTRFS_I(folio->mapping->host))); return false; } #endif int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, enum btrfs_subpage_type type); -void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio); +void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, + enum btrfs_subpage_type type); /* Allocate additional data where page represents more than one sector */ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, - enum btrfs_subpage_type type); + size_t fsize, enum btrfs_subpage_type type); void btrfs_free_subpage(struct btrfs_subpage *subpage); void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); @@ -110,6 +137,13 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, * btrfs_folio_clamp_*() are similar to btrfs_folio_*(), except the range doesn't * need to be inside the page. Those functions will truncate the range * automatically. + * + * Both btrfs_folio_*() and btrfs_folio_clamp_*() are for data folios. + * + * For metadata, one should use btrfs_meta_folio_*() helpers instead, and there + * is no clamp version for metadata helpers, as we either go subpage + * (nodesize < PAGE_SIZE) or go regular folio helpers (nodesize >= PAGE_SIZE, + * and our folio is never larger than nodesize). 
*/ #define DECLARE_BTRFS_SUBPAGE_OPS(name) \ void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \ @@ -129,7 +163,10 @@ void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len); \ bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ - struct folio *folio, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); \ +void btrfs_meta_folio_set_##name(struct folio *folio, const struct extent_buffer *eb); \ +void btrfs_meta_folio_clear_##name(struct folio *folio, const struct extent_buffer *eb); \ +bool btrfs_meta_folio_test_##name(struct folio *folio, const struct extent_buffer *eb); DECLARE_BTRFS_SUBPAGE_OPS(uptodate); DECLARE_BTRFS_SUBPAGE_OPS(dirty); @@ -155,6 +192,7 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); +bool btrfs_meta_folio_clear_and_test_dirty(struct folio *folio, const struct extent_buffer *eb); void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, struct folio *folio, unsigned long *ret_bitmap); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index dc4fee519ca6..40709e2a44fc 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -84,7 +84,7 @@ struct btrfs_fs_context { u32 thread_pool_size; unsigned long long mount_opt; unsigned long compress_type:4; - unsigned int compress_level; + int compress_level; refcount_t refs; }; @@ -947,7 +947,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec static int btrfs_fill_super(struct super_block *sb, struct btrfs_fs_devices *fs_devices) { - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_fs_info *fs_info = btrfs_sb(sb); int err; @@ -982,7 +982,7 @@ static int btrfs_fill_super(struct super_block *sb, goto fail_close; } - sb->s_root = d_make_root(inode); + sb->s_root = d_make_root(&inode->vfs_inode); if (!sb->s_root) { err = -ENOMEM; goto fail_close; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 14f53f757555..b9af74498b0c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -411,7 +411,8 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj, { ssize_t ret = 0; - /* An artificial limit to only support 4K and PAGE_SIZE */ + if (BTRFS_MIN_BLOCKSIZE != SZ_4K && BTRFS_MIN_BLOCKSIZE != PAGE_SIZE) + ret += sysfs_emit_at(buf, ret, "%u ", BTRFS_MIN_BLOCKSIZE); if (PAGE_SIZE > SZ_4K) ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K); ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE); @@ -1342,17 +1343,18 @@ int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) /* Separate value from input in policy:value format. */ value_str = strchr(param, ':'); if (value_str) { - int ret; + char *retptr; *value_str = 0; value_str++; if (!value_ret) return -EINVAL; - ret = kstrtos64(value_str, 10, value_ret); - if (ret) + + *value_ret = memparse(value_str, &retptr); + /* There could be any trailing typos after the value. 
*/ + retptr = skip_spaces(retptr); + if (*retptr != 0 || *value_ret <= 0) return -EINVAL; - if (*value_ret < 0) - return -ERANGE; } #endif diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 3fc5c6f90dc4..0f94ae923210 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -7,6 +7,7 @@ #include <linux/compiler_types.h> #include <linux/kobject.h> +struct block_device; struct btrfs_fs_info; struct btrfs_device; struct btrfs_fs_devices; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 0a2dbfaaf49e..74aca7180a5a 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -525,7 +525,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) goto out; } - eb = __alloc_dummy_extent_buffer(fs_info, 0, nodesize); + eb = alloc_dummy_extent_buffer(fs_info, 0); if (!eb) { test_std_err(TEST_ALLOC_ROOT); ret = -ENOMEM; @@ -542,7 +542,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) * Test again for case where the tree block is sectorsize aligned but * not nodesize aligned. */ - eb = __alloc_dummy_extent_buffer(fs_info, sectorsize, nodesize); + eb = alloc_dummy_extent_buffer(fs_info, sectorsize); if (!eb) { test_std_err(TEST_ALLOC_ROOT); ret = -ENOMEM; @@ -730,7 +730,7 @@ static int test_eb_mem_ops(u32 sectorsize, u32 nodesize) goto out; } - eb = __alloc_dummy_extent_buffer(fs_info, SZ_1M, nodesize); + eb = alloc_dummy_extent_buffer(fs_info, SZ_1M); if (!eb) { test_std_err(TEST_ALLOC_EXTENT_BUFFER); ret = -ENOMEM; diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 56e61ac1cc64..609bb6c9c087 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -1045,6 +1045,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, ret = btrfs_add_chunk_map(fs_info, map); if (ret) { test_err("error adding chunk map to mapping tree"); + btrfs_free_chunk_map(map); goto out_free; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index aca83a98b75a..f26a394a9ec5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -160,7 +160,13 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) cache = list_first_entry(&transaction->deleted_bgs, struct btrfs_block_group, bg_list); + /* + * Not strictly necessary to lock, as no other task will be using a + * block_group on the deleted_bgs list during a transaction abort. + */ + spin_lock(&transaction->fs_info->unused_bgs_lock); list_del_init(&cache->bg_list); + spin_unlock(&transaction->fs_info->unused_bgs_lock); btrfs_unfreeze_block_group(cache); btrfs_put_block_group(cache); } @@ -1635,7 +1641,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *root = pending->root; struct btrfs_root *parent_root; struct btrfs_block_rsv *rsv; - struct inode *parent_inode = &pending->dir->vfs_inode; + struct btrfs_inode *parent_inode = pending->dir; struct btrfs_path *path; struct btrfs_dir_item *dir_item; struct extent_buffer *tmp; @@ -1661,7 +1667,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * filesystem. 
*/ nofs_flags = memalloc_nofs_save(); - pending->error = fscrypt_setup_filename(parent_inode, + pending->error = fscrypt_setup_filename(&parent_inode->vfs_inode, &pending->dentry->d_name, 0, &fname); memalloc_nofs_restore(nofs_flags); @@ -1690,8 +1696,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } key.objectid = objectid; - key.offset = (u64)-1; key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; rsv = trans->block_rsv; trans->block_rsv = &pending->block_rsv; @@ -1699,16 +1705,16 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, trace_btrfs_space_reservation(fs_info, "transaction", trans->transid, trans->bytes_reserved, 1); - parent_root = BTRFS_I(parent_inode)->root; + parent_root = parent_inode->root; ret = record_root_in_trans(trans, parent_root, 0); if (ret) goto fail; - cur_time = current_time(parent_inode); + cur_time = current_time(&parent_inode->vfs_inode); /* * insert the directory item */ - ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index); + ret = btrfs_set_inode_index(parent_inode, &index); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1716,7 +1722,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* check if there is a file/dir which has the same name. */ dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, - btrfs_ino(BTRFS_I(parent_inode)), + btrfs_ino(parent_inode), &fname.disk_name, 0); if (dir_item != NULL && !IS_ERR(dir_item)) { pending->error = -EEXIST; @@ -1817,7 +1823,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, */ ret = btrfs_add_root_ref(trans, objectid, btrfs_root_id(parent_root), - btrfs_ino(BTRFS_I(parent_inode)), index, + btrfs_ino(parent_inode), index, &fname.disk_name); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1855,18 +1861,18 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; ret = btrfs_insert_dir_item(trans, &fname.disk_name, - BTRFS_I(parent_inode), &key, BTRFS_FT_DIR, + parent_inode, &key, BTRFS_FT_DIR, index); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } - btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + + btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + fname.disk_name.len * 2); - inode_set_mtime_to_ts(parent_inode, - inode_set_ctime_current(parent_inode)); - ret = btrfs_update_inode_fallback(trans, BTRFS_I(parent_inode)); + inode_set_mtime_to_ts(&parent_inode->vfs_inode, + inode_set_ctime_current(&parent_inode->vfs_inode)); + ret = btrfs_update_inode_fallback(trans, parent_inode); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -2096,7 +2102,14 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); + /* + * Not strictly necessary to lock, as no other task will be using a + * block_group on the new_bgs list during a transaction abort. + */ + spin_lock(&fs_info->unused_bgs_lock); list_del_init(&block_group->bg_list); + btrfs_put_block_group(block_group); + spin_unlock(&fs_info->unused_bgs_lock); } } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 955d1677e865..90dc094cfa5e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -138,10 +138,10 @@ static void wait_log_commit(struct btrfs_root *root, int transid); * and once to do all the other items. 
*/ -static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root) +static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root) { unsigned int nofs_flag; - struct inode *inode; + struct btrfs_inode *inode; /* * We're holding a transaction handle whether we are logging or @@ -376,12 +376,12 @@ static int process_one_buffer(struct btrfs_root *log, } /* - * Item overwrite used by replay and tree logging. eb, slot and key all refer - * to the src data we are copying out. + * Item overwrite used by log replay. The given eb, slot and key all refer to + * the source data we are copying out. * - * root is the tree we are copying into, and path is a scratch - * path for use in this function (it should be released on entry and - * will be released on exit). + * The given root is for the tree we are copying into, and path is a scratch + * path for use in this function (it should be released on entry and will be + * released on exit). * * If the key is already in the destination tree the existing item is * overwritten. If the existing item isn't big enough, it is extended. @@ -401,6 +401,8 @@ static int overwrite_item(struct btrfs_trans_handle *trans, int save_old_i_size = 0; unsigned long src_ptr; unsigned long dst_ptr; + struct extent_buffer *dst_eb; + int dst_slot; bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; /* @@ -420,11 +422,13 @@ static int overwrite_item(struct btrfs_trans_handle *trans, if (ret < 0) return ret; + dst_eb = path->nodes[0]; + dst_slot = path->slots[0]; + if (ret == 0) { char *src_copy; - char *dst_copy; - u32 dst_size = btrfs_item_size(path->nodes[0], - path->slots[0]); + const u32 dst_size = btrfs_item_size(dst_eb, dst_slot); + if (dst_size != item_size) goto insert; @@ -432,23 +436,16 @@ static int overwrite_item(struct btrfs_trans_handle *trans, btrfs_release_path(path); return 0; } - dst_copy = kmalloc(item_size, GFP_NOFS); src_copy = kmalloc(item_size, GFP_NOFS); - if (!dst_copy || !src_copy) { + if (!src_copy) { btrfs_release_path(path); - kfree(dst_copy); - kfree(src_copy); return -ENOMEM; } read_extent_buffer(eb, src_copy, src_ptr, item_size); + dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot); + ret = memcmp_extent_buffer(dst_eb, src_copy, dst_ptr, item_size); - dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); - read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, - item_size); - ret = memcmp(dst_copy, src_copy, item_size); - - kfree(dst_copy); kfree(src_copy); /* * they have the same contents, just return, this saves @@ -470,9 +467,9 @@ static int overwrite_item(struct btrfs_trans_handle *trans, u64 nbytes; u32 mode; - item = btrfs_item_ptr(path->nodes[0], path->slots[0], + item = btrfs_item_ptr(dst_eb, dst_slot, struct btrfs_inode_item); - nbytes = btrfs_inode_nbytes(path->nodes[0], item); + nbytes = btrfs_inode_nbytes(dst_eb, item); item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); btrfs_set_inode_nbytes(eb, item, nbytes); @@ -514,11 +511,13 @@ insert: key, item_size); path->skip_release_on_error = 0; + dst_eb = path->nodes[0]; + dst_slot = path->slots[0]; + /* make sure any existing item is the correct size */ if (ret == -EEXIST || ret == -EOVERFLOW) { - u32 found_size; - found_size = btrfs_item_size(path->nodes[0], - path->slots[0]); + const u32 found_size = btrfs_item_size(dst_eb, dst_slot); + if (found_size > item_size) btrfs_truncate_item(trans, path, item_size, 1); else if (found_size < item_size) @@ -526,8 +525,7 @@ insert: } else if (ret) { return ret; } - dst_ptr = 
btrfs_item_ptr_offset(path->nodes[0], - path->slots[0]); + dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot); /* don't overwrite an existing inode if the generation number * was logged as zero. This is done when the tree logging code @@ -546,7 +544,6 @@ insert: dst_item = (struct btrfs_inode_item *)dst_ptr; if (btrfs_inode_generation(eb, src_item) == 0) { - struct extent_buffer *dst_eb = path->nodes[0]; const u64 ino_size = btrfs_inode_size(eb, src_item); /* @@ -564,30 +561,28 @@ insert: } if (S_ISDIR(btrfs_inode_mode(eb, src_item)) && - S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { + S_ISDIR(btrfs_inode_mode(dst_eb, dst_item))) { save_old_i_size = 1; - saved_i_size = btrfs_inode_size(path->nodes[0], - dst_item); + saved_i_size = btrfs_inode_size(dst_eb, dst_item); } } - copy_extent_buffer(path->nodes[0], eb, dst_ptr, - src_ptr, item_size); + copy_extent_buffer(dst_eb, eb, dst_ptr, src_ptr, item_size); if (save_old_i_size) { struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; - btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); + btrfs_set_inode_size(dst_eb, dst_item, saved_i_size); } /* make sure the generation is filled in */ if (key->type == BTRFS_INODE_ITEM_KEY) { struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; - if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { - btrfs_set_inode_generation(path->nodes[0], dst_item, - trans->transid); - } + if (btrfs_inode_generation(dst_eb, dst_item) == 0) + btrfs_set_inode_generation(dst_eb, dst_item, trans->transid); } no_copy: btrfs_release_path(path); @@ -613,14 +608,14 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, * simple helper to read an inode off the disk from a given root * This can only be called for subvolume roots and not for the log */ -static noinline struct inode *read_one_inode(struct btrfs_root *root, - u64 objectid) +static noinline struct btrfs_inode *read_one_inode(struct btrfs_root *root, + u64 objectid) { - struct inode *inode; + struct btrfs_inode *inode; inode = btrfs_iget_logging(objectid, root); if (IS_ERR(inode)) - inode = NULL; + return NULL; return inode; } @@ -649,7 +644,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, u64 start = key->offset; u64 nbytes = 0; struct btrfs_file_extent_item *item; - struct inode *inode = NULL; + struct btrfs_inode *inode = NULL; unsigned long size; int ret = 0; @@ -688,31 +683,23 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, * file. This must be done before the btrfs_drop_extents run * so we don't try to drop this extent. 
*/ - ret = btrfs_lookup_file_extent(trans, root, path, - btrfs_ino(BTRFS_I(inode)), start, 0); + ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), start, 0); if (ret == 0 && (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC)) { - struct btrfs_file_extent_item cmp1; - struct btrfs_file_extent_item cmp2; - struct btrfs_file_extent_item *existing; - struct extent_buffer *leaf; - - leaf = path->nodes[0]; - existing = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); + struct btrfs_file_extent_item existing; + unsigned long ptr; - read_extent_buffer(eb, &cmp1, (unsigned long)item, - sizeof(cmp1)); - read_extent_buffer(leaf, &cmp2, (unsigned long)existing, - sizeof(cmp2)); + ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + read_extent_buffer(path->nodes[0], &existing, ptr, sizeof(existing)); /* * we already have a pointer to this exact extent, * we don't have to do anything */ - if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { + if (memcmp_extent_buffer(eb, &existing, (unsigned long)item, + sizeof(existing)) == 0) { btrfs_release_path(path); goto out; } @@ -723,7 +710,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, drop_args.start = start; drop_args.end = extent_end; drop_args.drop_cache = true; - ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); + ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) goto out; @@ -747,8 +734,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, (unsigned long)item, sizeof(*item)); ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); - ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); ins.type = BTRFS_EXTENT_ITEM_KEY; + ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); offset = key->offset - btrfs_file_extent_offset(eb, item); /* @@ -901,16 +888,15 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, goto out; } - ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, - extent_end - start); + ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start); if (ret) goto out; update_inode: - btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found); - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found); + ret = btrfs_update_inode(trans, inode); out: - iput(inode); + iput(&inode->vfs_inode); return ret; } @@ -947,7 +933,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di) { struct btrfs_root *root = dir->root; - struct inode *inode; + struct btrfs_inode *inode; struct fscrypt_str name; struct extent_buffer *leaf; struct btrfs_key location; @@ -972,10 +958,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name); + ret = unlink_inode_for_log_replay(trans, dir, inode, &name); out: kfree(name.name); - iput(inode); + iput(&inode->vfs_inode); return ret; } @@ -1148,7 +1134,7 @@ again: u32 item_size; u32 cur_offset = 0; unsigned long base; - struct inode *victim_parent; + struct btrfs_inode *victim_parent; leaf = path->nodes[0]; @@ -1188,10 +1174,10 @@ again: btrfs_release_path(path); ret = unlink_inode_for_log_replay(trans, - BTRFS_I(victim_parent), + victim_parent, inode, &victim_name); } - iput(victim_parent); + iput(&victim_parent->vfs_inode); kfree(victim_name.name); if (ret) return ret; @@ -1325,7 +1311,7 @@ 
again: ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name); if (!ret) { - struct inode *dir; + struct btrfs_inode *dir; btrfs_release_path(path); dir = read_one_inode(root, parent_id); @@ -1334,10 +1320,9 @@ again: kfree(name.name); goto out; } - ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), - inode, &name); + ret = unlink_inode_for_log_replay(trans, dir, inode, &name); kfree(name.name); - iput(dir); + iput(&dir->vfs_inode); if (ret) goto out; goto again; @@ -1369,8 +1354,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - struct inode *dir = NULL; - struct inode *inode = NULL; + struct btrfs_inode *dir = NULL; + struct btrfs_inode *inode = NULL; unsigned long ref_ptr; unsigned long ref_end; struct fscrypt_str name = { 0 }; @@ -1435,8 +1420,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), - btrfs_ino(BTRFS_I(inode)), ref_index, &name); + ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), + ref_index, &name); if (ret < 0) { goto out; } else if (ret == 0) { @@ -1447,8 +1432,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * overwrite any existing back reference, and we don't * want to create dangling pointers in the directory. */ - ret = __add_inode_ref(trans, root, path, log, - BTRFS_I(dir), BTRFS_I(inode), + ret = __add_inode_ref(trans, root, path, log, dir, inode, inode_objectid, parent_objectid, ref_index, &name); if (ret) { @@ -1458,12 +1442,11 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, } /* insert our name */ - ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - &name, 0, ref_index); + ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index); if (ret) goto out; - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, inode); if (ret) goto out; } @@ -1473,7 +1456,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, kfree(name.name); name.name = NULL; if (log_ref_ver) { - iput(dir); + iput(&dir->vfs_inode); dir = NULL; } } @@ -1486,8 +1469,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * dir index entries exist for a name but there is no inode reference * item with the same name. */ - ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot, - key); + ret = unlink_old_inode_refs(trans, root, path, inode, eb, slot, key); if (ret) goto out; @@ -1496,8 +1478,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); kfree(name.name); - iput(dir); - iput(inode); + if (dir) + iput(&dir->vfs_inode); + if (inode) + iput(&inode->vfs_inode); return ret; } @@ -1611,25 +1595,25 @@ process_slot: * will free the inode. 
*/ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, - struct inode *inode) + struct btrfs_inode *inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct btrfs_path *path; int ret; u64 nlink = 0; - u64 ino = btrfs_ino(BTRFS_I(inode)); + const u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); if (!path) return -ENOMEM; - ret = count_inode_refs(BTRFS_I(inode), path); + ret = count_inode_refs(inode, path); if (ret < 0) goto out; nlink = ret; - ret = count_inode_extrefs(BTRFS_I(inode), path); + ret = count_inode_extrefs(inode, path); if (ret < 0) goto out; @@ -1637,17 +1621,17 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, ret = 0; - if (nlink != inode->i_nlink) { - set_nlink(inode, nlink); - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + if (nlink != inode->vfs_inode.i_nlink) { + set_nlink(&inode->vfs_inode, nlink); + ret = btrfs_update_inode(trans, inode); if (ret) goto out; } - if (S_ISDIR(inode->i_mode)) - BTRFS_I(inode)->index_cnt = (u64)-1; + if (S_ISDIR(inode->vfs_inode.i_mode)) + inode->index_cnt = (u64)-1; - if (inode->i_nlink == 0) { - if (S_ISDIR(inode->i_mode)) { + if (inode->vfs_inode.i_nlink == 0) { + if (S_ISDIR(inode->vfs_inode.i_mode)) { ret = replay_dir_deletes(trans, root, NULL, path, ino, 1); if (ret) @@ -1669,12 +1653,13 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, { int ret; struct btrfs_key key; - struct inode *inode; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = (u64)-1; while (1) { + struct btrfs_inode *inode; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) break; @@ -1703,7 +1688,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, } ret = fixup_inode_link_count(trans, inode); - iput(inode); + iput(&inode->vfs_inode); if (ret) break; @@ -1731,12 +1716,14 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, { struct btrfs_key key; int ret = 0; - struct inode *inode; + struct btrfs_inode *inode; + struct inode *vfs_inode; inode = read_one_inode(root, objectid); if (!inode) return -EIO; + vfs_inode = &inode->vfs_inode; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = objectid; @@ -1745,15 +1732,15 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, btrfs_release_path(path); if (ret == 0) { - if (!inode->i_nlink) - set_nlink(inode, 1); + if (!vfs_inode->i_nlink) + set_nlink(vfs_inode, 1); else - inc_nlink(inode); - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + inc_nlink(vfs_inode); + ret = btrfs_update_inode(trans, inode); } else if (ret == -EEXIST) { ret = 0; } - iput(inode); + iput(vfs_inode); return ret; } @@ -1769,8 +1756,8 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, const struct fscrypt_str *name, struct btrfs_key *location) { - struct inode *inode; - struct inode *dir; + struct btrfs_inode *inode; + struct btrfs_inode *dir; int ret; inode = read_one_inode(root, location->objectid); @@ -1779,17 +1766,16 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, dir = read_one_inode(root, dirid); if (!dir) { - iput(inode); + iput(&inode->vfs_inode); return -EIO; } - ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, - 1, index); + ret = btrfs_add_link(trans, dir, inode, name, 1, index); /* FIXME, put inode into FIXUP list */ - iput(inode); - iput(dir); + 
iput(&inode->vfs_inode); + iput(&dir->vfs_inode); return ret; } @@ -1851,7 +1837,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, bool index_dst_matches = false; struct btrfs_key log_key; struct btrfs_key search_key; - struct inode *dir; + struct btrfs_inode *dir; u8 log_flags; bool exists; int ret; @@ -1881,9 +1867,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, ret = PTR_ERR(dir_dst_di); goto out; } else if (dir_dst_di) { - ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, - dir_dst_di, &log_key, - log_flags, exists); + ret = delete_conflicting_dir_entry(trans, dir, path, dir_dst_di, + &log_key, log_flags, exists); if (ret < 0) goto out; dir_dst_matches = (ret == 1); @@ -1898,9 +1883,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, ret = PTR_ERR(index_dst_di); goto out; } else if (index_dst_di) { - ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, - index_dst_di, &log_key, - log_flags, exists); + ret = delete_conflicting_dir_entry(trans, dir, path, index_dst_di, + &log_key, log_flags, exists); if (ret < 0) goto out; index_dst_matches = (ret == 1); @@ -1955,11 +1939,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, out: if (!ret && update_size) { - btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2); - ret = btrfs_update_inode(trans, BTRFS_I(dir)); + btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2); + ret = btrfs_update_inode(trans, dir); } kfree(name.name); - iput(dir); + iput(&dir->vfs_inode); if (!ret && name_added) ret = 1; return ret; @@ -2116,16 +2100,16 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, struct btrfs_path *log_path, - struct inode *dir, + struct btrfs_inode *dir, struct btrfs_key *dir_key) { - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; int ret; struct extent_buffer *eb; int slot; struct btrfs_dir_item *di; struct fscrypt_str name = { 0 }; - struct inode *inode = NULL; + struct btrfs_inode *inode = NULL; struct btrfs_key location; /* @@ -2172,9 +2156,8 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, if (ret) goto out; - inc_nlink(inode); - ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), - &name); + inc_nlink(&inode->vfs_inode); + ret = unlink_inode_for_log_replay(trans, dir, inode, &name); /* * Unlike dir item keys, dir index keys can only have one name (entry) in * them, as there are no key collisions since each key has a unique offset @@ -2184,7 +2167,8 @@ out: btrfs_release_path(path); btrfs_release_path(log_path); kfree(name.name); - iput(inode); + if (inode) + iput(&inode->vfs_inode); return ret; } @@ -2308,7 +2292,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_key dir_key; struct btrfs_key found_key; struct btrfs_path *log_path; - struct inode *dir; + struct btrfs_inode *dir; dir_key.objectid = dirid; dir_key.type = BTRFS_DIR_INDEX_KEY; @@ -2385,7 +2369,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); btrfs_free_path(log_path); - iput(dir); + iput(&dir->vfs_inode); return ret; } @@ -2479,7 +2463,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, */ if (S_ISREG(mode)) { struct btrfs_drop_extents_args drop_args = { 0 }; - struct inode *inode; + struct btrfs_inode *inode; u64 from; inode = read_one_inode(root, 
key.objectid); @@ -2487,22 +2471,20 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, ret = -EIO; break; } - from = ALIGN(i_size_read(inode), + from = ALIGN(i_size_read(&inode->vfs_inode), root->fs_info->sectorsize); drop_args.start = from; drop_args.end = (u64)-1; drop_args.drop_cache = true; - ret = btrfs_drop_extents(wc->trans, root, - BTRFS_I(inode), + ret = btrfs_drop_extents(wc->trans, root, inode, &drop_args); if (!ret) { - inode_sub_bytes(inode, + inode_sub_bytes(&inode->vfs_inode, drop_args.bytes_found); /* Update the inode's nbytes. */ - ret = btrfs_update_inode(wc->trans, - BTRFS_I(inode)); + ret = btrfs_update_inode(wc->trans, inode); } - iput(inode); + iput(&inode->vfs_inode); if (ret) break; } @@ -3560,8 +3542,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, struct btrfs_dir_log_item *item; key.objectid = dirid; - key.offset = first_offset; key.type = BTRFS_DIR_LOG_INDEX_KEY; + key.offset = first_offset; ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); /* * -EEXIST is fine and can happen sporadically when we are logging a @@ -5481,7 +5463,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, ihold(&curr_inode->vfs_inode); while (true) { - struct inode *vfs_inode; struct btrfs_key key; struct btrfs_key found_key; u64 next_index; @@ -5497,7 +5478,7 @@ again: struct extent_buffer *leaf = path->nodes[0]; struct btrfs_dir_item *di; struct btrfs_key di_key; - struct inode *di_inode; + struct btrfs_inode *di_inode; int log_mode = LOG_INODE_EXISTS; int type; @@ -5524,17 +5505,16 @@ again: goto out; } - if (!need_log_inode(trans, BTRFS_I(di_inode))) { - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + if (!need_log_inode(trans, di_inode)) { + btrfs_add_delayed_iput(di_inode); break; } ctx->log_new_dentries = false; if (type == BTRFS_FT_DIR) log_mode = LOG_INODE_ALL; - ret = btrfs_log_inode(trans, BTRFS_I(di_inode), - log_mode, ctx); - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + ret = btrfs_log_inode(trans, di_inode, log_mode, ctx); + btrfs_add_delayed_iput(di_inode); if (ret) goto out; if (ctx->log_new_dentries) { @@ -5576,14 +5556,13 @@ again: kfree(dir_elem); btrfs_add_delayed_iput(curr_inode); - curr_inode = NULL; - vfs_inode = btrfs_iget_logging(ino, root); - if (IS_ERR(vfs_inode)) { - ret = PTR_ERR(vfs_inode); + curr_inode = btrfs_iget_logging(ino, root); + if (IS_ERR(curr_inode)) { + ret = PTR_ERR(curr_inode); + curr_inode = NULL; break; } - curr_inode = BTRFS_I(vfs_inode); } out: btrfs_free_path(path); @@ -5661,7 +5640,7 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { struct btrfs_ino_list *ino_elem; - struct inode *inode; + struct btrfs_inode *inode; /* * It's rare to have a lot of conflicting inodes, in practice it is not @@ -5752,12 +5731,12 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, * inode in LOG_INODE_EXISTS mode and rename operations update the log, * so that the log ends up with the new name and without the old name. 
*/ - if (!need_log_inode(trans, BTRFS_I(inode))) { - btrfs_add_delayed_iput(BTRFS_I(inode)); + if (!need_log_inode(trans, inode)) { + btrfs_add_delayed_iput(inode); return 0; } - btrfs_add_delayed_iput(BTRFS_I(inode)); + btrfs_add_delayed_iput(inode); ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); if (!ino_elem) @@ -5793,7 +5772,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, */ while (!list_empty(&ctx->conflict_inodes)) { struct btrfs_ino_list *curr; - struct inode *inode; + struct btrfs_inode *inode; u64 ino; u64 parent; @@ -5829,9 +5808,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * dir index key range logged for the directory. So we * must make sure the deletion is recorded. */ - ret = btrfs_log_inode(trans, BTRFS_I(inode), - LOG_INODE_ALL, ctx); - btrfs_add_delayed_iput(BTRFS_I(inode)); + ret = btrfs_log_inode(trans, inode, LOG_INODE_ALL, ctx); + btrfs_add_delayed_iput(inode); if (ret) break; continue; @@ -5847,8 +5825,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * it again because if some other task logged the inode after * that, we can avoid doing it again. */ - if (!need_log_inode(trans, BTRFS_I(inode))) { - btrfs_add_delayed_iput(BTRFS_I(inode)); + if (!need_log_inode(trans, inode)) { + btrfs_add_delayed_iput(inode); continue; } @@ -5859,8 +5837,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * well because during a rename we pin the log and update the * log with the new name before we unpin it. */ - ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); - btrfs_add_delayed_iput(BTRFS_I(inode)); + ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx); + btrfs_add_delayed_iput(inode); if (ret) break; } @@ -6351,7 +6329,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, list_for_each_entry(item, delayed_ins_list, log_list) { struct btrfs_dir_item *dir_item; - struct inode *di_inode; + struct btrfs_inode *di_inode; struct btrfs_key key; int log_mode = LOG_INODE_EXISTS; @@ -6367,8 +6345,8 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, break; } - if (!need_log_inode(trans, BTRFS_I(di_inode))) { - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + if (!need_log_inode(trans, di_inode)) { + btrfs_add_delayed_iput(di_inode); continue; } @@ -6376,12 +6354,12 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, log_mode = LOG_INODE_ALL; ctx->log_new_dentries = false; - ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); + ret = btrfs_log_inode(trans, di_inode, log_mode, ctx); if (!ret && ctx->log_new_dentries) - ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx); + ret = log_new_dir_dentries(trans, di_inode, ctx); - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + btrfs_add_delayed_iput(di_inode); if (ret) break; @@ -6789,7 +6767,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr_offset(leaf, slot); while (cur_offset < item_size) { struct btrfs_key inode_key; - struct inode *dir_inode; + struct btrfs_inode *dir_inode; inode_key.type = BTRFS_INODE_ITEM_KEY; inode_key.offset = 0; @@ -6838,18 +6816,16 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, goto out; } - if (!need_log_inode(trans, BTRFS_I(dir_inode))) { - btrfs_add_delayed_iput(BTRFS_I(dir_inode)); + if (!need_log_inode(trans, dir_inode)) { + btrfs_add_delayed_iput(dir_inode); continue; } ctx->log_new_dentries = false; - ret = btrfs_log_inode(trans, BTRFS_I(dir_inode), - 
LOG_INODE_ALL, ctx); + ret = btrfs_log_inode(trans, dir_inode, LOG_INODE_ALL, ctx); if (!ret && ctx->log_new_dentries) - ret = log_new_dir_dentries(trans, - BTRFS_I(dir_inode), ctx); - btrfs_add_delayed_iput(BTRFS_I(dir_inode)); + ret = log_new_dir_dentries(trans, dir_inode, ctx); + btrfs_add_delayed_iput(dir_inode); if (ret) goto out; } @@ -6874,7 +6850,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; int slot; struct btrfs_key search_key; - struct inode *inode; + struct btrfs_inode *inode; u64 ino; int ret = 0; @@ -6889,11 +6865,10 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) return PTR_ERR(inode); - if (BTRFS_I(inode)->generation >= trans->transid && - need_log_inode(trans, BTRFS_I(inode))) - ret = btrfs_log_inode(trans, BTRFS_I(inode), - LOG_INODE_EXISTS, ctx); - btrfs_add_delayed_iput(BTRFS_I(inode)); + if (inode->generation >= trans->transid && + need_log_inode(trans, inode)) + ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx); + btrfs_add_delayed_iput(inode); if (ret) return ret; @@ -7061,26 +7036,20 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; - bool log_dentries = false; + bool log_dentries; - if (btrfs_test_opt(fs_info, NOTREELOG)) { - ret = BTRFS_LOG_FORCE_COMMIT; - goto end_no_trans; - } + if (btrfs_test_opt(fs_info, NOTREELOG)) + return BTRFS_LOG_FORCE_COMMIT; - if (btrfs_root_refs(&root->root_item) == 0) { - ret = BTRFS_LOG_FORCE_COMMIT; - goto end_no_trans; - } + if (btrfs_root_refs(&root->root_item) == 0) + return BTRFS_LOG_FORCE_COMMIT; /* * If we're logging an inode from a subvolume created in the current * transaction we must force a commit since the root is not persisted. */ - if (btrfs_root_generation(&root->root_item) == trans->transid) { - ret = BTRFS_LOG_FORCE_COMMIT; - goto end_no_trans; - } + if (btrfs_root_generation(&root->root_item) == trans->transid) + return BTRFS_LOG_FORCE_COMMIT; /* * Skip already logged inodes or inodes corresponding to tmpfiles @@ -7089,14 +7058,12 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, */ if ((btrfs_inode_in_log(inode, trans->transid) && list_empty(&ctx->ordered_extents)) || - inode->vfs_inode.i_nlink == 0) { - ret = BTRFS_NO_LOG_SYNC; - goto end_no_trans; - } + inode->vfs_inode.i_nlink == 0) + return BTRFS_NO_LOG_SYNC; ret = start_log_trans(trans, root, ctx); if (ret) - goto end_no_trans; + return ret; ret = btrfs_log_inode(trans, inode, inode_only, ctx); if (ret) @@ -7115,8 +7082,11 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; } - if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries) - log_dentries = true; + /* + * Track if we need to log dentries because ctx->log_new_dentries can + * be modified in the call chains below. 
+ */ + log_dentries = ctx->log_new_dentries; /* * On unlink we must make sure all our current and old parent directory @@ -7171,8 +7141,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (log_dentries) ret = log_new_dir_dentries(trans, inode, ctx); - else - ret = 0; end_trans: if (ret < 0) { btrfs_set_log_full_commit(trans); @@ -7182,7 +7150,7 @@ end_trans: if (ret) btrfs_remove_log_ctx(root, ctx); btrfs_end_log_trans(root); -end_no_trans: + return ret; } @@ -7247,8 +7215,8 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) again: key.objectid = BTRFS_TREE_LOG_OBJECTID; - key.offset = (u64)-1; key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; while (1) { ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index e97ad824ae16..b7a96a005487 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -485,7 +485,7 @@ static int rollback_verity(struct btrfs_inode *inode) goto out; } inode->ro_flags &= ~BTRFS_INODE_RO_VERITY; - btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + btrfs_sync_inode_flags_to_i_flags(inode); ret = btrfs_update_inode(trans, inode); if (ret) { btrfs_abort_transaction(trans, ret); @@ -552,7 +552,7 @@ static int finish_verity(struct btrfs_inode *inode, const void *desc, goto out; } inode->ro_flags |= BTRFS_INODE_RO_VERITY; - btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + btrfs_sync_inode_flags_to_i_flags(inode); ret = btrfs_update_inode(trans, inode); if (ret) goto end_trans; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3f8afbd1ebb5..c8c21c55be53 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1798,8 +1798,8 @@ again: path->skip_locking = 1; key.objectid = device->devid; - key.offset = search_start; key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = search_start; ret = btrfs_search_backwards(root, &key, path); if (ret < 0) @@ -1918,8 +1918,8 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, return -ENOMEM; key.objectid = device->devid; - key.offset = start; key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = start; again: ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { @@ -2721,8 +2721,8 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) return -ENOMEM; key.objectid = BTRFS_DEV_ITEMS_OBJECTID; - key.offset = 0; key.type = BTRFS_DEV_ITEM_KEY; + key.offset = 0; while (1) { btrfs_reserve_chunk_metadata(trans, false); @@ -3119,8 +3119,8 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) return -ENOMEM; key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - key.offset = chunk_offset; key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = chunk_offset; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) @@ -3577,8 +3577,8 @@ static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) again: key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = (u64)-1; while (1) { mutex_lock(&fs_info->reclaim_bgs_lock); @@ -4184,8 +4184,8 @@ again: bctl->sys.limit = limit_sys; } key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = (u64)-1; while (1) { if ((!counting && atomic_read(&fs_info->balance_pause_req)) || @@ -5001,8 +5001,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) again: key.objectid = device->devid; - key.offset = (u64)-1; key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = (u64)-1; do { 
mutex_lock(&fs_info->reclaim_bgs_lock); @@ -7539,8 +7539,8 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). */ key.objectid = BTRFS_DEV_ITEMS_OBJECTID; - key.offset = 0; key.type = 0; + key.offset = 0; btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct extent_buffer *node = path->nodes[1]; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 120f65e21eeb..e247d551da67 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -7,6 +7,7 @@ #define BTRFS_VOLUMES_H #include <linux/blk_types.h> +#include <linux/blkdev.h> #include <linux/sizes.h> #include <linux/atomic.h> #include <linux/sort.h> @@ -18,14 +19,17 @@ #include <linux/completion.h> #include <linux/rbtree.h> #include <uapi/linux/btrfs.h> +#include <uapi/linux/btrfs_tree.h> #include "messages.h" #include "rcu-string.h" +#include "extent-io-tree.h" struct block_device; struct bdev_handle; struct btrfs_fs_info; struct btrfs_block_group; struct btrfs_trans_handle; +struct btrfs_transaction; struct btrfs_zoned_device_info; #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 8dc4cf49f6f0..0ce10e4ec836 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -6,6 +6,8 @@ #ifndef BTRFS_XATTR_H #define BTRFS_XATTR_H +#include <linux/types.h> + struct dentry; struct inode; struct qstr; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index c9e92c6941ec..545f413d81fc 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -94,6 +94,47 @@ fail: return ERR_PTR(-ENOMEM); } +/* + * Helper for S390x with hardware zlib compression support. + * + * That hardware acceleration requires a buffer size larger than a single page + * to get ideal performance, thus we need to do the memory copy rather than + * use the page cache directly as input buffer. + */ +static int copy_data_into_buffer(struct address_space *mapping, + struct workspace *workspace, u64 filepos, + unsigned long length) +{ + u64 cur = filepos; + + /* It's only for hardware accelerated zlib code. */ + ASSERT(zlib_deflate_dfltcc_enabled()); + + while (cur < filepos + length) { + struct folio *folio; + void *data_in; + unsigned int offset; + unsigned long copy_length; + int ret; + + ret = btrfs_compress_filemap_get_folio(mapping, cur, &folio); + if (ret < 0) + return ret; + /* No large folio support yet. */ + ASSERT(!folio_test_large(folio)); + + offset = offset_in_folio(folio, cur); + copy_length = min(folio_size(folio) - offset, + filepos + length - cur); + + data_in = kmap_local_folio(folio, offset); + memcpy(workspace->buf + cur - filepos, data_in, copy_length); + kunmap_local(data_in); + cur += copy_length; + } + return 0; +} + int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) @@ -105,8 +146,6 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, int nr_folios = 0; struct folio *in_folio = NULL; struct folio *out_folio = NULL; - unsigned long bytes_left; - unsigned int in_buf_folios; unsigned long len = *total_out; unsigned long nr_dest_folios = *out_folios; const unsigned long max_out = nr_dest_folios * PAGE_SIZE; @@ -150,34 +189,21 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, * the workspace buffer if required. 
*/ if (workspace->strm.avail_in == 0) { - bytes_left = len - workspace->strm.total_in; - in_buf_folios = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE), - workspace->buf_size / PAGE_SIZE); - if (in_buf_folios > 1) { - int i; - - /* S390 hardware acceleration path, not subpage. */ - ASSERT(!btrfs_is_subpage( - inode_to_fs_info(mapping->host), - mapping)); - for (i = 0; i < in_buf_folios; i++) { - if (data_in) { - kunmap_local(data_in); - folio_put(in_folio); - data_in = NULL; - } - ret = btrfs_compress_filemap_get_folio(mapping, - start, &in_folio); - if (ret < 0) - goto out; - data_in = kmap_local_folio(in_folio, 0); - copy_page(workspace->buf + i * PAGE_SIZE, - data_in); - start += PAGE_SIZE; - } + unsigned long bytes_left = len - workspace->strm.total_in; + unsigned int copy_length = min(bytes_left, workspace->buf_size); + + /* + * This can only happen when hardware zlib compression is + * enabled. + */ + if (copy_length > PAGE_SIZE) { + ret = copy_data_into_buffer(mapping, workspace, + start, copy_length); + if (ret < 0) + goto out; + start += copy_length; workspace->strm.next_in = workspace->buf; - workspace->strm.avail_in = min(bytes_left, - in_buf_folios << PAGE_SHIFT); + workspace->strm.avail_in = copy_length; } else { unsigned int pg_off; unsigned int cur_len; @@ -463,6 +489,7 @@ out: const struct btrfs_compress_op btrfs_zlib_compress = { .workspace_manager = &wsm, + .min_level = 1, .max_level = 9, .default_level = BTRFS_ZLIB_DEFAULT_LEVEL, }; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 73e0aa9fc08a..fb8b8b29c169 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2111,6 +2111,9 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) physical = map->stripes[i].physical; zinfo = device->zone_info; + if (!device->bdev) + continue; + if (zinfo->max_active_zones == 0) continue; @@ -2272,6 +2275,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ struct btrfs_zoned_device_info *zinfo = device->zone_info; unsigned int nofs_flags; + if (!device->bdev) + continue; + if (zinfo->max_active_zones == 0) continue; @@ -2325,6 +2331,9 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) if (!btrfs_is_zoned(fs_info)) return true; + if (test_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags)) + return false; + /* Check if there is a device with active zones left */ mutex_lock(&fs_info->chunk_mutex); spin_lock(&fs_info->zone_active_bgs_lock); diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 5232b56d5892..cd5f38d6fbaa 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -26,11 +26,12 @@ #define ZSTD_BTRFS_MAX_WINDOWLOG 17 #define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) #define ZSTD_BTRFS_DEFAULT_LEVEL 3 +#define ZSTD_BTRFS_MIN_LEVEL -15 #define ZSTD_BTRFS_MAX_LEVEL 15 /* 307s to avoid pathologically clashing with transaction commit */ #define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ) -static zstd_parameters zstd_get_btrfs_parameters(unsigned int level, +static zstd_parameters zstd_get_btrfs_parameters(int level, size_t src_len) { zstd_parameters params = zstd_get_params(level, src_len); @@ -45,13 +46,14 @@ struct workspace { void *mem; size_t size; char *buf; - unsigned int level; - unsigned int req_level; + int level; + int req_level; unsigned long last_used; /* jiffies */ struct list_head list; struct list_head lru_list; zstd_in_buffer in_buf; zstd_out_buffer out_buf; + zstd_parameters params; }; /* @@ -93,8 +95,10 @@ static inline struct workspace *list_to_workspace(struct list_head *list) return 
container_of(list, struct workspace, list); } -void zstd_free_workspace(struct list_head *ws); -struct list_head *zstd_alloc_workspace(unsigned int level); +static inline int clip_level(int level) +{ + return max(0, level - 1); +} /* * Timer callback to free unused workspaces. @@ -123,7 +127,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) list_for_each_prev_safe(pos, next, &wsm.lru_list) { struct workspace *victim = container_of(pos, struct workspace, lru_list); - unsigned int level; + int level; if (time_after(victim->last_used, reclaim_threshold)) break; @@ -137,8 +141,8 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) list_del(&victim->list); zstd_free_workspace(&victim->list); - if (list_empty(&wsm.idle_ws[level - 1])) - clear_bit(level - 1, &wsm.active_map); + if (list_empty(&wsm.idle_ws[level])) + clear_bit(level, &wsm.active_map); } @@ -160,9 +164,11 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) static void zstd_calc_ws_mem_sizes(void) { size_t max_size = 0; - unsigned int level; + int level; - for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) { + for (level = ZSTD_BTRFS_MIN_LEVEL; level <= ZSTD_BTRFS_MAX_LEVEL; level++) { + if (level == 0) + continue; zstd_parameters params = zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT); size_t level_size = @@ -171,7 +177,8 @@ static void zstd_calc_ws_mem_sizes(void) zstd_dstream_workspace_bound(ZSTD_BTRFS_MAX_INPUT)); max_size = max_t(size_t, max_size, level_size); - zstd_ws_mem_sizes[level - 1] = max_size; + /* Use level 1 workspace size for all the fast mode negative levels. */ + zstd_ws_mem_sizes[clip_level(level)] = max_size; } } @@ -233,11 +240,11 @@ void zstd_cleanup_workspace_manager(void) * offer the opportunity to reclaim the workspace in favor of allocating an * appropriately sized one in the future. */ -static struct list_head *zstd_find_workspace(unsigned int level) +static struct list_head *zstd_find_workspace(int level) { struct list_head *ws; struct workspace *workspace; - int i = level - 1; + int i = clip_level(level); spin_lock_bh(&wsm.lock); for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) { @@ -247,7 +254,7 @@ static struct list_head *zstd_find_workspace(unsigned int level) list_del_init(ws); /* keep its place if it's a lower level using this */ workspace->req_level = level; - if (level == workspace->level) + if (clip_level(level) == workspace->level) list_del(&workspace->lru_list); if (list_empty(&wsm.idle_ws[i])) clear_bit(i, &wsm.active_map); @@ -270,7 +277,7 @@ static struct list_head *zstd_find_workspace(unsigned int level) * attempt to allocate a new workspace. If we fail to allocate one due to * memory pressure, go to sleep waiting for the max level workspace to free up. 
*/ -struct list_head *zstd_get_workspace(unsigned int level) +struct list_head *zstd_get_workspace(int level) { struct list_head *ws; unsigned int nofs_flag; @@ -319,7 +326,7 @@ void zstd_put_workspace(struct list_head *ws) spin_lock_bh(&wsm.lock); /* A node is only taken off the lru if we are the corresponding level */ - if (workspace->req_level == workspace->level) { + if (clip_level(workspace->req_level) == workspace->level) { /* Hide a max level workspace from reclaim */ if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) { INIT_LIST_HEAD(&workspace->lru_list); @@ -332,13 +339,13 @@ void zstd_put_workspace(struct list_head *ws) } } - set_bit(workspace->level - 1, &wsm.active_map); - list_add(&workspace->list, &wsm.idle_ws[workspace->level - 1]); + set_bit(workspace->level, &wsm.active_map); + list_add(&workspace->list, &wsm.idle_ws[workspace->level]); workspace->req_level = 0; spin_unlock_bh(&wsm.lock); - if (workspace->level == ZSTD_BTRFS_MAX_LEVEL) + if (workspace->level == clip_level(ZSTD_BTRFS_MAX_LEVEL)) cond_wake_up(&wsm.wait); } @@ -351,7 +358,7 @@ void zstd_free_workspace(struct list_head *ws) kfree(workspace); } -struct list_head *zstd_alloc_workspace(unsigned int level) +struct list_head *zstd_alloc_workspace(int level) { struct workspace *workspace; @@ -359,8 +366,9 @@ struct list_head *zstd_alloc_workspace(unsigned int level) if (!workspace) return ERR_PTR(-ENOMEM); - workspace->size = zstd_ws_mem_sizes[level - 1]; - workspace->level = level; + /* Use level 1 workspace size for all the fast mode negative levels. */ + workspace->size = zstd_ws_mem_sizes[clip_level(level)]; + workspace->level = clip_level(level); workspace->req_level = level; workspace->last_used = jiffies; workspace->mem = kvmalloc(workspace->size, GFP_KERNEL | __GFP_NOWARN); @@ -393,17 +401,15 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, const unsigned long nr_dest_folios = *out_folios; const u64 orig_end = start + len; unsigned long max_out = nr_dest_folios * PAGE_SIZE; - unsigned int pg_off; unsigned int cur_len; - zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level, - len); + workspace->params = zstd_get_btrfs_parameters(workspace->req_level, len); *out_folios = 0; *total_out = 0; *total_in = 0; /* Initialize the stream */ - stream = zstd_init_cstream(¶ms, len, workspace->mem, + stream = zstd_init_cstream(&workspace->params, len, workspace->mem, workspace->size); if (unlikely(!stream)) { struct btrfs_inode *inode = BTRFS_I(mapping->host); @@ -420,9 +426,8 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); if (ret < 0) goto out; - pg_off = offset_in_page(start); cur_len = btrfs_calc_input_length(orig_end, start); - workspace->in_buf.src = kmap_local_folio(in_folio, pg_off); + workspace->in_buf.src = kmap_local_folio(in_folio, offset_in_page(start)); workspace->in_buf.pos = 0; workspace->in_buf.size = cur_len; @@ -506,9 +511,9 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); if (ret < 0) goto out; - pg_off = offset_in_page(start); cur_len = btrfs_calc_input_length(orig_end, start); - workspace->in_buf.src = kmap_local_folio(in_folio, pg_off); + workspace->in_buf.src = kmap_local_folio(in_folio, + offset_in_page(start)); workspace->in_buf.pos = 0; workspace->in_buf.size = cur_len; } @@ -717,6 +722,7 @@ finish: const struct btrfs_compress_op 
btrfs_zstd_compress = { /* ZSTD uses own workspace manager */ .workspace_manager = NULL, + .min_level = ZSTD_BTRFS_MIN_LEVEL, .max_level = ZSTD_BTRFS_MAX_LEVEL, .default_level = ZSTD_BTRFS_DEFAULT_LEVEL, }; diff --git a/fs/coredump.c b/fs/coredump.c index d6a92cd6018e..c33c177a701b 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1042,7 +1042,9 @@ static const struct ctl_table coredump_sysctls[] = { .data = &core_pipe_limit, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "core_file_note_size_limit", diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index 5aff5934baa1..b5dfb0aa405a 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -3,6 +3,7 @@ config FS_ENCRYPTION bool "FS Encryption (Per-file encryption)" select CRYPTO select CRYPTO_HASH + select CRYPTO_HKDF select CRYPTO_SKCIPHER select CRYPTO_LIB_SHA256 select KEYS @@ -24,20 +25,16 @@ config FS_ENCRYPTION # # Also note that this option only pulls in the generic implementations of the # algorithms, not any per-architecture optimized implementations. It is -# strongly recommended to enable optimized implementations too. It is safe to -# disable these generic implementations if corresponding optimized -# implementations will always be available too; for this reason, these are soft -# dependencies ('imply' rather than 'select'). Only disable these generic -# implementations if you're sure they will never be needed, though. +# strongly recommended to enable optimized implementations too. config FS_ENCRYPTION_ALGS tristate - imply CRYPTO_AES - imply CRYPTO_CBC - imply CRYPTO_CTS - imply CRYPTO_ECB - imply CRYPTO_HMAC - imply CRYPTO_SHA512 - imply CRYPTO_XTS + select CRYPTO_AES + select CRYPTO_CBC + select CRYPTO_CTS + select CRYPTO_ECB + select CRYPTO_HMAC + select CRYPTO_SHA512 + select CRYPTO_XTS config FS_ENCRYPTION_INLINE_CRYPT bool "Enable fscrypt to use inline crypto" diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c index 5a384dad2c72..855a0f4b7318 100644 --- a/fs/crypto/hkdf.c +++ b/fs/crypto/hkdf.c @@ -1,9 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation - * Function"), aka RFC 5869. See also the original paper (Krawczyk 2010): - * "Cryptographic Extraction and Key Derivation: The HKDF Scheme". - * * This is used to derive keys from the fscrypt master keys. * * Copyright 2019 Google LLC @@ -11,6 +7,7 @@ #include <crypto/hash.h> #include <crypto/sha2.h> +#include <crypto/hkdf.h> #include "fscrypt_private.h" @@ -44,20 +41,6 @@ * there's no way to persist a random salt per master key from kernel mode. */ -/* HKDF-Extract (RFC 5869 section 2.2), unsalted */ -static int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm, - unsigned int ikmlen, u8 prk[HKDF_HASHLEN]) -{ - static const u8 default_salt[HKDF_HASHLEN]; - int err; - - err = crypto_shash_setkey(hmac_tfm, default_salt, HKDF_HASHLEN); - if (err) - return err; - - return crypto_shash_tfm_digest(hmac_tfm, ikm, ikmlen, prk); -} - /* * Compute HKDF-Extract using the given master key as the input keying material, * and prepare an HMAC transform object keyed by the resulting pseudorandom key. 
@@ -69,6 +52,7 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, unsigned int master_key_size) { struct crypto_shash *hmac_tfm; + static const u8 default_salt[HKDF_HASHLEN]; u8 prk[HKDF_HASHLEN]; int err; @@ -84,7 +68,8 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, goto err_free_tfm; } - err = hkdf_extract(hmac_tfm, master_key, master_key_size, prk); + err = hkdf_extract(hmac_tfm, master_key, master_key_size, + default_salt, HKDF_HASHLEN, prk); if (err) goto err_free_tfm; @@ -118,61 +103,21 @@ int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, u8 *okm, unsigned int okmlen) { SHASH_DESC_ON_STACK(desc, hkdf->hmac_tfm); - u8 prefix[9]; - unsigned int i; + u8 *full_info; int err; - const u8 *prev = NULL; - u8 counter = 1; - u8 tmp[HKDF_HASHLEN]; - - if (WARN_ON_ONCE(okmlen > 255 * HKDF_HASHLEN)) - return -EINVAL; + full_info = kzalloc(infolen + 9, GFP_KERNEL); + if (!full_info) + return -ENOMEM; desc->tfm = hkdf->hmac_tfm; - memcpy(prefix, "fscrypt\0", 8); - prefix[8] = context; - - for (i = 0; i < okmlen; i += HKDF_HASHLEN) { - - err = crypto_shash_init(desc); - if (err) - goto out; - - if (prev) { - err = crypto_shash_update(desc, prev, HKDF_HASHLEN); - if (err) - goto out; - } - - err = crypto_shash_update(desc, prefix, sizeof(prefix)); - if (err) - goto out; - - err = crypto_shash_update(desc, info, infolen); - if (err) - goto out; - - BUILD_BUG_ON(sizeof(counter) != 1); - if (okmlen - i < HKDF_HASHLEN) { - err = crypto_shash_finup(desc, &counter, 1, tmp); - if (err) - goto out; - memcpy(&okm[i], tmp, okmlen - i); - memzero_explicit(tmp, sizeof(tmp)); - } else { - err = crypto_shash_finup(desc, &counter, 1, &okm[i]); - if (err) - goto out; - } - counter++; - prev = &okm[i]; - } - err = 0; -out: - if (unlikely(err)) - memzero_explicit(okm, okmlen); /* so caller doesn't need to */ - shash_desc_zero(desc); + memcpy(full_info, "fscrypt\0", 8); + full_info[8] = context; + memcpy(full_info + 9, info, infolen); + + err = hkdf_expand(hkdf->hmac_tfm, full_info, infolen + 9, + okm, okmlen); + kfree_sensitive(full_info); return err; } diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 40de69860dcf..7fa53d30aec3 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -130,6 +130,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci) crypto_cfg.crypto_mode = ci->ci_mode->blk_crypto_mode; crypto_cfg.data_unit_size = 1U << ci->ci_data_unit_bits; crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci); + crypto_cfg.key_type = BLK_CRYPTO_KEY_TYPE_RAW; devs = fscrypt_get_devices(sb, &num_devs); if (IS_ERR(devs)) @@ -166,7 +167,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, if (!blk_key) return -ENOMEM; - err = blk_crypto_init_key(blk_key, raw_key, crypto_mode, + err = blk_crypto_init_key(blk_key, raw_key, ci->ci_mode->keysize, + BLK_CRYPTO_KEY_TYPE_RAW, crypto_mode, fscrypt_get_dun_bytes(ci), 1U << ci->ci_data_unit_bits); if (err) { diff --git a/fs/dcache.c b/fs/dcache.c index 623947d6a676..bd5aa136153a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -73,8 +73,13 @@ * If no ancestor relationship: * arbitrary, since it's serialized on rename_lock */ -int sysctl_vfs_cache_pressure __read_mostly = 100; -EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); +static int sysctl_vfs_cache_pressure __read_mostly = 100; + +unsigned long vfs_pressure_ratio(unsigned long val) +{ + return mult_frac(val, sysctl_vfs_cache_pressure, 100); +} +EXPORT_SYMBOL_GPL(vfs_pressure_ratio); 
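/*
 * A minimal sketch (hypothetical caller, not part of this patch): with
 * sysctl_vfs_cache_pressure now static, outside users go through the new
 * vfs_pressure_ratio() helper above, which scales a count by
 * vm.vfs_cache_pressure / 100 using mult_frac() to avoid intermediate
 * overflow.
 */
static unsigned long demo_cache_count(unsigned long nr_objects)
{
	/* 100 leaves the count unchanged; larger values bias toward reclaim */
	return vfs_pressure_ratio(nr_objects);
}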
__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); @@ -211,8 +216,20 @@ static const struct ctl_table fs_dcache_sysctls[] = { }, }; +static const struct ctl_table vm_dcache_sysctls[] = { + { + .procname = "vfs_cache_pressure", + .data = &sysctl_vfs_cache_pressure, + .maxlen = sizeof(sysctl_vfs_cache_pressure), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +}; + static int __init init_fs_dcache_sysctls(void) { + register_sysctl_init("vm", vm_dcache_sysctls); register_sysctl_init("fs", fs_dcache_sysctls); return 0; } diff --git a/fs/dlm/config.h b/fs/dlm/config.h index e48c4f9686d3..13a3d0b26194 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -23,7 +23,7 @@ struct dlm_config_node { extern const struct rhashtable_params dlm_rhash_rsb_params; -#define DLM_MAX_ADDR_COUNT 3 +#define DLM_MAX_ADDR_COUNT 8 #define DLM_PROTO_TCP 0 #define DLM_PROTO_SCTP 1 diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index c8ff88f1cdcf..e01d5f29f4d2 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -741,6 +741,7 @@ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len, read_lock_bh(&ls->ls_rsbtbl_lock); if (!rsb_flag(r, RSB_HASHED)) { read_unlock_bh(&ls->ls_rsbtbl_lock); + error = -EBADR; goto do_new; } @@ -784,6 +785,7 @@ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len, } } else { write_unlock_bh(&ls->ls_rsbtbl_lock); + error = -EBADR; goto do_new; } diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 8afac6e2dff0..1929327ffbe1 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -576,7 +576,7 @@ static int new_lockspace(const char *name, const char *cluster, lockspace to start running (via sysfs) in dlm_ls_start(). */ error = do_uevent(ls, 1); - if (error) + if (error < 0) goto out_recoverd; /* wait until recovery is successful or failed */ diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index d28141829c05..70abd4da17a6 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1826,8 +1826,8 @@ static int dlm_tcp_listen_validate(void) { /* We don't support multi-homed hosts */ if (dlm_local_count > 1) { - log_print("TCP protocol can't handle multi-homed hosts, try SCTP"); - return -EINVAL; + log_print("Detected a multi-homed host, but only the first IP address will be used."); + log_print("Try SCTP if you want to enable multi-link."); } return 0; diff --git a/fs/drop_caches.c b/fs/drop_caches.c index d45ef541d848..019a8b4eaaf9 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -14,7 +14,7 @@ #include "internal.h" /* A global variable is a bit ugly, but it keeps the code simple */ -int sysctl_drop_caches; +static int sysctl_drop_caches; static void drop_pagecache_sb(struct super_block *sb, void *unused) { @@ -48,7 +48,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) iput(toput_inode); } -int drop_caches_sysctl_handler(const struct ctl_table *table, int write, +static int drop_caches_sysctl_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int ret; @@ -77,3 +77,22 @@ int drop_caches_sysctl_handler(const struct ctl_table *table, int write, } return 0; } + +static const struct ctl_table drop_caches_table[] = { + { + .procname = "drop_caches", + .data = &sysctl_drop_caches, + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = drop_caches_sysctl_handler, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_FOUR, + }, +}; + +static int __init init_vm_drop_caches_sysctls(void) +{ + register_sysctl_init("vm", drop_caches_table); + return 0; +}
+fs_initcall(init_vm_drop_caches_sysctls); diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index c294a8fc566d..cb1b6d0c3454 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -57,11 +57,10 @@ static ssize_t efivarfs_file_write(struct file *file, if (bytes == -ENOENT) { /* - * FIXME: temporary workaround for fwupdate, signal - * failed write with a 1 to keep created but not - * written files + * a zero size signals to the release handler that the + * write deleted the variable */ - i_size_write(inode, 1); + i_size_write(inode, 0); } else { i_size_write(inode, datasize + sizeof(attributes)); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); @@ -125,8 +124,7 @@ static int efivarfs_file_release(struct inode *inode, struct file *file) struct efivar_entry *var = inode->i_private; inode_lock(inode); - /* FIXME: temporary work around for fwupdate */ - var->removed = (--var->open_count == 0 && i_size_read(inode) == 1); + var->removed = (--var->open_count == 0 && i_size_read(inode) == 0); inode_unlock(inode); if (var->removed) diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 6ea60661fa55..331e49cd1b8d 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -13,12 +13,12 @@ config EROFS_FS smartphones with Android OS, LiveCDs and high-density hosts with numerous containers; - It also provides fixed-sized output compression support in order to - improve storage density as well as keep relatively higher compression - ratios and implements in-place decompression to reuse the file page - for compressed data temporarily with proper strategies, which is - quite useful to ensure guaranteed end-to-end runtime decompression - performance under extremely memory pressure without extra cost. + It also provides transparent compression and deduplication support to + improve storage density and maintain relatively high compression + ratios, and it implements in-place decompression to temporarily reuse + page cache for compressed data using proper strategies, which is + quite useful for ensuring guaranteed end-to-end runtime decompression + performance under extreme memory pressure without extra cost. See the documentation at <file:Documentation/filesystems/erofs.rst> and the web pages at <https://erofs.docs.kernel.org> for more details. @@ -97,7 +97,7 @@ config EROFS_FS_ZIP select LZ4_DECOMPRESS default y help - Enable fixed-sized output compression for EROFS. + Enable transparent compression support for EROFS file systems. If you don't want to enable compression feature, say N.
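The coredump, dcache, and drop_caches hunks above all follow the same sysctl pattern: make the variable static, describe it in a file-local const ctl_table with proc_dointvec_minmax bounds, and register the table from an initcall with register_sysctl_init(). A minimal sketch under those assumptions (the "demo_knob" name and its values are hypothetical, not from this diff):

#include <linux/init.h>
#include <linux/sysctl.h>

static int sysctl_demo_knob = 100;	/* hypothetical tunable */

static const struct ctl_table demo_sysctls[] = {
	{
		.procname	= "demo_knob",
		.data		= &sysctl_demo_knob,
		.maxlen		= sizeof(sysctl_demo_knob),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,		/* reject values below 0 */
		.extra2		= SYSCTL_INT_MAX,	/* and above INT_MAX */
	},
};

static int __init init_demo_sysctls(void)
{
	register_sysctl_init("vm", demo_sysctls);	/* /proc/sys/vm/demo_knob */
	return 0;
}
fs_initcall(init_demo_sysctls);

With the minmax handler, out-of-range writes fail with -EINVAL, which is the behavior the core_pipe_limit hunk adopts.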
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 65ff39401020..2704d7a592a5 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -11,6 +11,7 @@ struct z_erofs_decompress_req { struct super_block *sb; struct page **in, **out; + unsigned int inpages, outpages; unsigned short pageofs_in, pageofs_out; unsigned int inputsize, outputsize; @@ -59,7 +60,6 @@ extern const struct z_erofs_decompressor *z_erofs_decomp[]; struct z_erofs_stream_dctx { struct z_erofs_decompress_req *rq; - unsigned int inpages, outpages; /* # of {en,de}coded pages */ int no, ni; /* the current {en,de}coded page # */ unsigned int avail_out; /* remaining bytes in the decoded buffer */ diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 0cd6b5c4df98..2409d2ab0c28 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -25,8 +25,7 @@ void erofs_put_metabuf(struct erofs_buf *buf) buf->page = NULL; } -void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, - enum erofs_kmap_type type) +void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap) { pgoff_t index = offset >> PAGE_SHIFT; struct folio *folio = NULL; @@ -43,10 +42,10 @@ void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, return folio; } buf->page = folio_file_page(folio, index); - if (!buf->base && type == EROFS_KMAP) - buf->base = kmap_local_page(buf->page); - if (type == EROFS_NO_KMAP) + if (!need_kmap) return NULL; + if (!buf->base) + buf->base = kmap_local_page(buf->page); return buf->base + (offset & ~PAGE_MASK); } @@ -65,64 +64,47 @@ void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb) } void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, - erofs_off_t offset, enum erofs_kmap_type type) + erofs_off_t offset, bool need_kmap) { erofs_init_metabuf(buf, sb); - return erofs_bread(buf, offset, type); -} - -static int erofs_map_blocks_flatmode(struct inode *inode, - struct erofs_map_blocks *map) -{ - struct erofs_inode *vi = EROFS_I(inode); - struct super_block *sb = inode->i_sb; - bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE); - erofs_blk_t lastblk = erofs_iblks(inode) - tailendpacking; - - map->m_flags = EROFS_MAP_MAPPED; /* no hole in flat inodes */ - if (map->m_la < erofs_pos(sb, lastblk)) { - map->m_pa = erofs_pos(sb, vi->raw_blkaddr) + map->m_la; - map->m_plen = erofs_pos(sb, lastblk) - map->m_la; - } else { - DBG_BUGON(!tailendpacking); - map->m_pa = erofs_iloc(inode) + vi->inode_isize + - vi->xattr_isize + erofs_blkoff(sb, map->m_la); - map->m_plen = inode->i_size - map->m_la; - - /* inline data should be located in the same meta block */ - if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) { - erofs_err(sb, "inline data across blocks @ nid %llu", vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; - } - map->m_flags |= EROFS_MAP_META; - } - return 0; + return erofs_bread(buf, offset, need_kmap); } int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) { + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct super_block *sb = inode->i_sb; + unsigned int unit, blksz = sb->s_blocksize; struct erofs_inode *vi = EROFS_I(inode); struct erofs_inode_chunk_index *idx; - struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - u64 chunknr; - unsigned int unit; + erofs_blk_t startblk, addrmask; + bool tailpacking; erofs_off_t pos; - void *kaddr; + u64 chunknr; int err = 0; trace_erofs_map_blocks_enter(inode, map, 0); map->m_deviceid = 0; - if (map->m_la >= inode->i_size) { - /* leave out-of-bound access unmapped */ - map->m_flags = 0; - 
map->m_plen = map->m_llen; + map->m_flags = 0; + if (map->m_la >= inode->i_size) goto out; - } if (vi->datalayout != EROFS_INODE_CHUNK_BASED) { - err = erofs_map_blocks_flatmode(inode, map); + tailpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE); + if (!tailpacking && vi->startblk == EROFS_NULL_ADDR) + goto out; + pos = erofs_pos(sb, erofs_iblks(inode) - tailpacking); + + map->m_flags = EROFS_MAP_MAPPED; + if (map->m_la < pos) { + map->m_pa = erofs_pos(sb, vi->startblk) + map->m_la; + map->m_llen = pos - map->m_la; + } else { + map->m_pa = erofs_iloc(inode) + vi->inode_isize + + vi->xattr_isize + erofs_blkoff(sb, map->m_la); + map->m_llen = inode->i_size - map->m_la; + map->m_flags |= EROFS_MAP_META; + } goto out; } @@ -135,45 +117,44 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, unit) + unit * chunknr; - kaddr = erofs_read_metabuf(&buf, sb, pos, EROFS_KMAP); - if (IS_ERR(kaddr)) { - err = PTR_ERR(kaddr); + idx = erofs_read_metabuf(&buf, sb, pos, true); + if (IS_ERR(idx)) { + err = PTR_ERR(idx); goto out; } map->m_la = chunknr << vi->chunkbits; - map->m_plen = min_t(erofs_off_t, 1UL << vi->chunkbits, - round_up(inode->i_size - map->m_la, sb->s_blocksize)); - - /* handle block map */ - if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) { - __le32 *blkaddr = kaddr; - - if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) { - map->m_flags = 0; - } else { - map->m_pa = erofs_pos(sb, le32_to_cpu(*blkaddr)); + map->m_llen = min_t(erofs_off_t, 1UL << vi->chunkbits, + round_up(inode->i_size - map->m_la, blksz)); + if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES) { + addrmask = (vi->chunkformat & EROFS_CHUNK_FORMAT_48BIT) ? + BIT_ULL(48) - 1 : BIT_ULL(32) - 1; + startblk = (((u64)le16_to_cpu(idx->startblk_hi) << 32) | + le32_to_cpu(idx->startblk_lo)) & addrmask; + if ((startblk ^ EROFS_NULL_ADDR) & addrmask) { + map->m_deviceid = le16_to_cpu(idx->device_id) & + EROFS_SB(sb)->device_id_mask; + map->m_pa = erofs_pos(sb, startblk); + map->m_flags = EROFS_MAP_MAPPED; + } + } else { + startblk = le32_to_cpu(*(__le32 *)idx); + if (startblk != (u32)EROFS_NULL_ADDR) { + map->m_pa = erofs_pos(sb, startblk); map->m_flags = EROFS_MAP_MAPPED; } - goto out_unlock; - } - /* parse chunk indexes */ - idx = kaddr; - switch (le32_to_cpu(idx->blkaddr)) { - case EROFS_NULL_ADDR: - map->m_flags = 0; - break; - default: - map->m_deviceid = le16_to_cpu(idx->device_id) & - EROFS_SB(sb)->device_id_mask; - map->m_pa = erofs_pos(sb, le32_to_cpu(idx->blkaddr)); - map->m_flags = EROFS_MAP_MAPPED; - break; } -out_unlock: erofs_put_metabuf(&buf); out: - if (!err) - map->m_llen = map->m_plen; + if (!err) { + map->m_plen = map->m_llen; + /* inline data should be located in the same meta block */ + if ((map->m_flags & EROFS_MAP_META) && + erofs_blkoff(sb, map->m_pa) + map->m_plen > blksz) { + erofs_err(sb, "inline data across blocks @ nid %llu", vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + } trace_erofs_map_blocks_exit(inode, map, 0, err); return err; } @@ -192,7 +173,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) { struct erofs_dev_context *devs = EROFS_SB(sb)->devs; struct erofs_device_info *dif; - erofs_off_t startoff, length; + erofs_off_t startoff; int id; erofs_fill_from_devinfo(map, sb, &EROFS_SB(sb)->dif0); @@ -205,7 +186,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) return -ENODEV; } if (devs->flatdev) { - map->m_pa += erofs_pos(sb, dif->mapped_blkaddr); + 
map->m_pa += erofs_pos(sb, dif->uniaddr); up_read(&devs->rwsem); return 0; } @@ -214,13 +195,12 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) } else if (devs->extra_devices && !devs->flatdev) { down_read(&devs->rwsem); idr_for_each_entry(&devs->tree, dif, id) { - if (!dif->mapped_blkaddr) + if (!dif->uniaddr) continue; - startoff = erofs_pos(sb, dif->mapped_blkaddr); - length = erofs_pos(sb, dif->blocks); + startoff = erofs_pos(sb, dif->uniaddr); if (map->m_pa >= startoff && - map->m_pa < startoff + length) { + map->m_pa < startoff + erofs_pos(sb, dif->blocks)) { map->m_pa -= startoff; erofs_fill_from_devinfo(map, sb, dif); break; @@ -312,7 +292,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, struct erofs_buf buf = __EROFS_BUF_INITIALIZER; iomap->type = IOMAP_INLINE; - ptr = erofs_read_metabuf(&buf, sb, mdev.m_pa, EROFS_KMAP); + ptr = erofs_read_metabuf(&buf, sb, mdev.m_pa, true); if (IS_ERR(ptr)) return PTR_ERR(ptr); iomap->inline_data = ptr; diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 2b123b070a42..bf62e2836b60 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -9,14 +9,6 @@ #define LZ4_MAX_DISTANCE_PAGES (DIV_ROUND_UP(LZ4_DISTANCE_MAX, PAGE_SIZE) + 1) -struct z_erofs_lz4_decompress_ctx { - struct z_erofs_decompress_req *rq; - /* # of encoded, decoded pages */ - unsigned int inpages, outpages; - /* decoded block total length (used for in-place decompression) */ - unsigned int oend; -}; - static int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, void *data, int size) { @@ -55,10 +47,9 @@ static int z_erofs_load_lz4_config(struct super_block *sb, * Fill all gaps with bounce pages if it's a sparse page list. Also check if * all physical pages are consecutive, which can be seen for moderate CR. */ -static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, +static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq, struct page **pagepool) { - struct z_erofs_decompress_req *rq = ctx->rq; struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL }; unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES, BITS_PER_LONG)] = { 0 }; @@ -68,7 +59,7 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, unsigned int i, j, top; top = 0; - for (i = j = 0; i < ctx->outpages; ++i, ++j) { + for (i = j = 0; i < rq->outpages; ++i, ++j) { struct page *const page = rq->out[i]; struct page *victim; @@ -114,36 +105,36 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, return kaddr ? 
1 : 0; } -static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, +static void *z_erofs_lz4_handle_overlap(struct z_erofs_decompress_req *rq, void *inpage, void *out, unsigned int *inputmargin, int *maptype, bool may_inplace) { - struct z_erofs_decompress_req *rq = ctx->rq; - unsigned int omargin, total, i; + unsigned int oend, omargin, total, i; struct page **in; void *src, *tmp; if (rq->inplace_io) { - omargin = PAGE_ALIGN(ctx->oend) - ctx->oend; + oend = rq->pageofs_out + rq->outputsize; + omargin = PAGE_ALIGN(oend) - oend; if (rq->partial_decoding || !may_inplace || omargin < LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize)) goto docopy; - for (i = 0; i < ctx->inpages; ++i) - if (rq->out[ctx->outpages - ctx->inpages + i] != + for (i = 0; i < rq->inpages; ++i) + if (rq->out[rq->outpages - rq->inpages + i] != rq->in[i]) goto docopy; kunmap_local(inpage); *maptype = 3; - return out + ((ctx->outpages - ctx->inpages) << PAGE_SHIFT); + return out + ((rq->outpages - rq->inpages) << PAGE_SHIFT); } - if (ctx->inpages <= 1) { + if (rq->inpages <= 1) { *maptype = 0; return inpage; } kunmap_local(inpage); - src = erofs_vm_map_ram(rq->in, ctx->inpages); + src = erofs_vm_map_ram(rq->in, rq->inpages); if (!src) return ERR_PTR(-ENOMEM); *maptype = 1; @@ -152,7 +143,7 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, docopy: /* Or copy compressed data which can be overlapped to per-CPU buffer */ in = rq->in; - src = z_erofs_get_gbuf(ctx->inpages); + src = z_erofs_get_gbuf(rq->inpages); if (!src) { DBG_BUGON(1); kunmap_local(inpage); @@ -197,10 +188,8 @@ int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, return 0; } -static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, - u8 *dst) +static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, u8 *dst) { - struct z_erofs_decompress_req *rq = ctx->rq; bool support_0padding = false, may_inplace = false; unsigned int inputmargin; u8 *out, *headpage, *src; @@ -224,7 +213,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, } inputmargin = rq->pageofs_in; - src = z_erofs_lz4_handle_overlap(ctx, headpage, dst, &inputmargin, + src = z_erofs_lz4_handle_overlap(rq, headpage, dst, &inputmargin, &maptype, may_inplace); if (IS_ERR(src)) return PTR_ERR(src); @@ -251,7 +240,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, if (maptype == 0) { kunmap_local(headpage); } else if (maptype == 1) { - vm_unmap_ram(src, ctx->inpages); + vm_unmap_ram(src, rq->inpages); } else if (maptype == 2) { z_erofs_put_gbuf(src); } else if (maptype != 3) { @@ -264,54 +253,42 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, struct page **pagepool) { - struct z_erofs_lz4_decompress_ctx ctx; unsigned int dst_maptype; void *dst; int ret; - ctx.rq = rq; - ctx.oend = rq->pageofs_out + rq->outputsize; - ctx.outpages = PAGE_ALIGN(ctx.oend) >> PAGE_SHIFT; - ctx.inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; - /* one optimized fast path only for non bigpcluster cases yet */ - if (ctx.inpages == 1 && ctx.outpages == 1 && !rq->inplace_io) { + if (rq->inpages == 1 && rq->outpages == 1 && !rq->inplace_io) { DBG_BUGON(!*rq->out); dst = kmap_local_page(*rq->out); dst_maptype = 0; - goto dstmap_out; - } - - /* general decoding path which can be used for all cases */ - ret = z_erofs_lz4_prepare_dstpages(&ctx, pagepool); - if 
(ret < 0) { - return ret; - } else if (ret > 0) { - dst = page_address(*rq->out); - dst_maptype = 1; } else { - dst = erofs_vm_map_ram(rq->out, ctx.outpages); - if (!dst) - return -ENOMEM; - dst_maptype = 2; + /* general decoding path which can be used for all cases */ + ret = z_erofs_lz4_prepare_dstpages(rq, pagepool); + if (ret < 0) + return ret; + if (ret > 0) { + dst = page_address(*rq->out); + dst_maptype = 1; + } else { + dst = erofs_vm_map_ram(rq->out, rq->outpages); + if (!dst) + return -ENOMEM; + dst_maptype = 2; + } } - -dstmap_out: - ret = z_erofs_lz4_decompress_mem(&ctx, dst); + ret = z_erofs_lz4_decompress_mem(rq, dst); if (!dst_maptype) kunmap_local(dst); else if (dst_maptype == 2) - vm_unmap_ram(dst, ctx.outpages); + vm_unmap_ram(dst, rq->outpages); return ret; } static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, struct page **pagepool) { - const unsigned int nrpages_in = - PAGE_ALIGN(rq->pageofs_in + rq->inputsize) >> PAGE_SHIFT; - const unsigned int nrpages_out = - PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + const unsigned int nrpages_in = rq->inpages, nrpages_out = rq->outpages; const unsigned int bs = rq->sb->s_blocksize; unsigned int cur = 0, ni = 0, no, pi, po, insz, cnt; u8 *kin; @@ -336,7 +313,7 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, rq->outputsize -= cur; } - for (; rq->outputsize; rq->pageofs_in = 0, cur += PAGE_SIZE, ni++) { + for (; rq->outputsize; rq->pageofs_in = 0, cur += insz, ni++) { insz = min(PAGE_SIZE - rq->pageofs_in, rq->outputsize); rq->outputsize -= insz; if (!rq->in[ni]) @@ -373,7 +350,7 @@ int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst, unsigned int j; if (!dctx->avail_out) { - if (++dctx->no >= dctx->outpages || !rq->outputsize) { + if (++dctx->no >= rq->outpages || !rq->outputsize) { erofs_err(sb, "insufficient space for decompressed data"); return -EFSCORRUPTED; } @@ -401,7 +378,7 @@ int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst, } if (dctx->inbuf_pos == dctx->inbuf_sz && rq->inputsize) { - if (++dctx->ni >= dctx->inpages) { + if (++dctx->ni >= rq->inpages) { erofs_err(sb, "invalid compressed data"); return -EFSCORRUPTED; } @@ -434,7 +411,7 @@ int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst, dctx->bounced = true; } - for (j = dctx->ni + 1; j < dctx->inpages; ++j) { + for (j = dctx->ni + 1; j < rq->inpages; ++j) { if (rq->out[dctx->no] != rq->in[j]) continue; tmppage = erofs_allocpage(pgpl, rq->gfp); diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c index 5070d2fcc737..c6908a487054 100644 --- a/fs/erofs/decompressor_deflate.c +++ b/fs/erofs/decompressor_deflate.c @@ -101,13 +101,7 @@ static int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq, struct page **pgpl) { struct super_block *sb = rq->sb; - struct z_erofs_stream_dctx dctx = { - .rq = rq, - .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT, - .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) - >> PAGE_SHIFT, - .no = -1, .ni = 0, - }; + struct z_erofs_stream_dctx dctx = { .rq = rq, .no = -1, .ni = 0 }; struct z_erofs_deflate *strm; int zerr, err; diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index 40666815046f..832cffb83a66 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -150,13 +150,7 @@ static int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, struct page **pgpl) { struct super_block *sb = rq->sb; - struct 
z_erofs_stream_dctx dctx = { - .rq = rq, - .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT, - .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) - >> PAGE_SHIFT, - .no = -1, .ni = 0, - }; + struct z_erofs_stream_dctx dctx = { .rq = rq, .no = -1, .ni = 0 }; struct xz_buf buf = {}; struct z_erofs_lzma *strm; enum xz_ret xz_err; diff --git a/fs/erofs/decompressor_zstd.c b/fs/erofs/decompressor_zstd.c index 7e177304967e..b4bfe14229f9 100644 --- a/fs/erofs/decompressor_zstd.c +++ b/fs/erofs/decompressor_zstd.c @@ -139,13 +139,7 @@ static int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq, struct page **pgpl) { struct super_block *sb = rq->sb; - struct z_erofs_stream_dctx dctx = { - .rq = rq, - .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT, - .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) - >> PAGE_SHIFT, - .no = -1, .ni = 0, - }; + struct z_erofs_stream_dctx dctx = { .rq = rq, .no = -1, .ni = 0 }; zstd_in_buffer in_buf = { NULL, 0, 0 }; zstd_out_buffer out_buf = { NULL, 0, 0 }; struct z_erofs_zstd *strm; diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index c3b90abdee37..2fae209d0274 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -58,9 +58,9 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) struct erofs_dirent *de; unsigned int nameoff, maxsize; - de = erofs_bread(&buf, dbstart, EROFS_KMAP); + de = erofs_bread(&buf, dbstart, true); if (IS_ERR(de)) { - erofs_err(sb, "fail to readdir of logical block %u of nid %llu", + erofs_err(sb, "failed to readdir of logical block %llu of nid %llu", erofs_blknr(sb, dbstart), EROFS_I(dir)->nid); err = PTR_ERR(de); break; @@ -90,6 +90,11 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) ofs = 0; } erofs_put_metabuf(&buf); + if (EROFS_I(dir)->dot_omitted && ctx->pos == dir->i_size) { + if (!dir_emit_dot(f, ctx)) + return 0; + ++ctx->pos; + } return err < 0 ? err : 0; } diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 199395ed1c1f..9581e9bf8192 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -30,25 +30,19 @@ #define EROFS_FEATURE_INCOMPAT_FRAGMENTS 0x00000020 #define EROFS_FEATURE_INCOMPAT_DEDUPE 0x00000020 #define EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES 0x00000040 +#define EROFS_FEATURE_INCOMPAT_48BIT 0x00000080 #define EROFS_ALL_FEATURE_INCOMPAT \ - (EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \ - EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ - EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \ - EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \ - EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \ - EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \ - EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \ - EROFS_FEATURE_INCOMPAT_FRAGMENTS | \ - EROFS_FEATURE_INCOMPAT_DEDUPE | \ - EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES) + ((EROFS_FEATURE_INCOMPAT_48BIT << 1) - 1) #define EROFS_SB_EXTSLOT_SIZE 16 struct erofs_deviceslot { u8 tag[64]; /* digest(sha256), etc. 
*/ - __le32 blocks; /* total fs blocks of this device */ - __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */ - u8 reserved[56]; + __le32 blocks_lo; /* total blocks count of this device */ + __le32 uniaddr_lo; /* unified starting block of this device */ + __le32 blocks_hi; /* total blocks count MSB */ + __le16 uniaddr_hi; /* unified starting block MSB */ + u8 reserved[50]; }; #define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot) @@ -59,13 +53,14 @@ struct erofs_super_block { __le32 feature_compat; __u8 blkszbits; /* filesystem block size in bit shift */ __u8 sb_extslots; /* superblock size = 128 + sb_extslots * 16 */ - - __le16 root_nid; /* nid of root directory */ + union { + __le16 rootnid_2b; /* nid of root directory */ + __le16 blocks_hi; /* (48BIT on) blocks count MSB */ + } rb; __le64 inos; /* total valid ino # (== f_files - f_favail) */ - - __le64 build_time; /* compact inode time derivation */ - __le32 build_time_nsec; /* compact inode time derivation in ns scale */ - __le32 blocks; /* used for statfs */ + __le64 epoch; /* base seconds used for compact inodes */ + __le32 fixed_nsec; /* fixed nanoseconds for compact inodes */ + __le32 blocks_lo; /* blocks count LSB */ __le32 meta_blkaddr; /* start block address of metadata area */ __le32 xattr_blkaddr; /* start block address of shared xattr area */ __u8 uuid[16]; /* 128-bit uuid for volume */ @@ -84,7 +79,10 @@ struct erofs_super_block { __le32 xattr_prefix_start; /* start of long xattr prefixes */ __le64 packed_nid; /* nid of the special packed inode */ __u8 xattr_filter_reserved; /* reserved for xattr name filter */ - __u8 reserved2[23]; + __u8 reserved[3]; + __le32 build_time; /* seconds added to epoch for mkfs time */ + __le64 rootnid_8b; /* (48BIT on) nid of root directory */ + __u8 reserved2[8]; }; /* @@ -115,19 +113,19 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode) #define EROFS_I_VERSION_MASK 0x01 #define EROFS_I_DATALAYOUT_MASK 0x07 -#define EROFS_I_VERSION_BIT 0 -#define EROFS_I_DATALAYOUT_BIT 1 -#define EROFS_I_ALL_BIT 4 - -#define EROFS_I_ALL ((1 << EROFS_I_ALL_BIT) - 1) +#define EROFS_I_VERSION_BIT 0 +#define EROFS_I_DATALAYOUT_BIT 1 +#define EROFS_I_NLINK_1_BIT 4 /* non-directory compact inodes only */ +#define EROFS_I_DOT_OMITTED_BIT 4 /* (directories) omit the `.` dirent */ +#define EROFS_I_ALL ((1 << (EROFS_I_NLINK_1_BIT + 1)) - 1) /* indicate chunk blkbits, thus 'chunksize = blocksize << chunk blkbits' */ #define EROFS_CHUNK_FORMAT_BLKBITS_MASK 0x001F -/* with chunk indexes or just a 4-byte blkaddr array */ +/* with chunk indexes or just a 4-byte block array */ #define EROFS_CHUNK_FORMAT_INDEXES 0x0020 +#define EROFS_CHUNK_FORMAT_48BIT 0x0040 -#define EROFS_CHUNK_FORMAT_ALL \ - (EROFS_CHUNK_FORMAT_BLKBITS_MASK | EROFS_CHUNK_FORMAT_INDEXES) +#define EROFS_CHUNK_FORMAT_ALL ((EROFS_CHUNK_FORMAT_48BIT << 1) - 1) /* 32-byte on-disk inode */ #define EROFS_INODE_LAYOUT_COMPACT 0 @@ -140,45 +138,40 @@ struct erofs_inode_chunk_info { }; union erofs_inode_i_u { - /* total compressed blocks for compressed inodes */ - __le32 compressed_blocks; - - /* block address for uncompressed flat inodes */ - __le32 raw_blkaddr; - - /* for device files, used to indicate old/new device # */ - __le32 rdev; - - /* for chunk-based files, it contains the summary info */ + __le32 blocks_lo; /* total blocks count (if compressed inodes) */ + __le32 startblk_lo; /* starting block number (if flat inodes) */ + __le32 rdev; /* device ID (if special inodes) */ struct erofs_inode_chunk_info c; }; +union 
erofs_inode_i_nb { + __le16 nlink; /* if EROFS_I_NLINK_1_BIT is unset */ + __le16 blocks_hi; /* total blocks count MSB */ + __le16 startblk_hi; /* starting block number MSB */ +}; + /* 32-byte reduced form of an ondisk inode */ struct erofs_inode_compact { __le16 i_format; /* inode format hints */ - -/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ __le16 i_xattr_icount; __le16 i_mode; - __le16 i_nlink; + union erofs_inode_i_nb i_nb; __le32 i_size; - __le32 i_reserved; + __le32 i_mtime; union erofs_inode_i_u i_u; __le32 i_ino; /* only used for 32-bit stat compatibility */ __le16 i_uid; __le16 i_gid; - __le32 i_reserved2; + __le32 i_reserved; }; /* 64-byte complete form of an ondisk inode */ struct erofs_inode_extended { __le16 i_format; /* inode format hints */ - -/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ __le16 i_xattr_icount; __le16 i_mode; - __le16 i_reserved; + union erofs_inode_i_nb i_nb; __le64 i_size; union erofs_inode_i_u i_u; @@ -248,6 +241,7 @@ static inline unsigned int erofs_xattr_ibody_size(__le16 i_xattr_icount) if (!i_xattr_icount) return 0; + /* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ return sizeof(struct erofs_xattr_ibody_header) + sizeof(__u32) * (le16_to_cpu(i_xattr_icount) - 1); } @@ -266,11 +260,11 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e) /* 4-byte block address array */ #define EROFS_BLOCK_MAP_ENTRY_SIZE sizeof(__le32) -/* 8-byte inode chunk indexes */ +/* 8-byte inode chunk index */ struct erofs_inode_chunk_index { - __le16 advise; /* always 0, don't care for now */ + __le16 startblk_hi; /* starting block number MSB */ __le16 device_id; /* back-end storage id (with bits masked) */ - __le32 blkaddr; /* start block address of this inode chunk */ + __le32 startblk_lo; /* starting block number of this chunk */ }; /* dirent sorts in alphabet order, thus we can do binary search */ @@ -337,21 +331,20 @@ struct z_erofs_zstd_cfgs { #define Z_EROFS_ZSTD_MAX_DICT_SIZE Z_EROFS_PCLUSTER_MAX_SIZE /* - * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on) - * e.g. for 4k logical cluster size, 4B if compacted 2B is off; - * (4B) + 2B + (4B) if compacted 2B is on. - * bit 1 : HEAD1 big pcluster (0 - off; 1 - on) - * bit 2 : HEAD2 big pcluster (0 - off; 1 - on) - * bit 3 : tailpacking inline pcluster (0 - off; 1 - on) - * bit 4 : interlaced plain pcluster (0 - off; 1 - on) - * bit 5 : fragment pcluster (0 - off; 1 - on) + * Enable COMPACTED_2B for EROFS_INODE_COMPRESSED_COMPACT inodes: + * 4B (disabled) vs 4B+2B+4B (enabled) */ #define Z_EROFS_ADVISE_COMPACTED_2B 0x0001 +/* Enable extent metadata for EROFS_INODE_COMPRESSED_FULL inodes */ +#define Z_EROFS_ADVISE_EXTENTS 0x0001 #define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002 #define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004 #define Z_EROFS_ADVISE_INLINE_PCLUSTER 0x0008 #define Z_EROFS_ADVISE_INTERLACED_PCLUSTER 0x0010 #define Z_EROFS_ADVISE_FRAGMENT_PCLUSTER 0x0020 +/* Indicate the record size for each extent if extent metadata is used */ +#define Z_EROFS_ADVISE_EXTRECSZ_BIT 1 +#define Z_EROFS_ADVISE_EXTRECSZ_MASK 0x3 #define Z_EROFS_FRAGMENT_INODE_BIT 7 struct z_erofs_map_header { @@ -363,45 +356,24 @@ struct z_erofs_map_header { /* indicates the encoded size of tailpacking data */ __le16 h_idata_size; }; + __le32 h_extents_lo; /* extent count LSB */ }; __le16 h_advise; - /* - * bit 0-3 : algorithm type of head 1 (logical cluster type 01); - * bit 4-7 : algorithm type of head 2 (logical cluster type 11). 
- */ - __u8 h_algorithmtype; - /* - * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096; - * bit 3-6 : reserved; - * bit 7 : move the whole file into packed inode or not. - */ - __u8 h_clusterbits; + union { + struct { + /* algorithm type (bit 0-3: HEAD1; bit 4-7: HEAD2) */ + __u8 h_algorithmtype; + /* + * bit 0-3 : logical cluster bits - blkszbits + * bit 4-6 : reserved + * bit 7 : pack the whole file into packed inode + */ + __u8 h_clusterbits; + }; + __le16 h_extents_hi; /* extent count MSB */ + }; }; -/* - * On-disk logical cluster type: - * 0 - literal (uncompressed) lcluster - * 1,3 - compressed lcluster (for HEAD lclusters) - * 2 - compressed lcluster (for NONHEAD lclusters) - * - * In detail, - * 0 - literal (uncompressed) lcluster, - * di_advise = 0 - * di_clusterofs = the literal data offset of the lcluster - * di_blkaddr = the blkaddr of the literal pcluster - * - * 1,3 - compressed lcluster (for HEAD lclusters) - * di_advise = 1 or 3 - * di_clusterofs = the decompressed data offset of the lcluster - * di_blkaddr = the blkaddr of the compressed pcluster - * - * 2 - compressed lcluster (for NONHEAD lclusters) - * di_advise = 2 - * di_clusterofs = - * the decompressed data offset in its own HEAD lcluster - * di_u.delta[0] = distance to this HEAD lcluster - * di_u.delta[1] = distance to the next HEAD lcluster - */ enum { Z_EROFS_LCLUSTER_TYPE_PLAIN = 0, Z_EROFS_LCLUSTER_TYPE_HEAD1 = 1, @@ -415,11 +387,7 @@ enum { /* (noncompact only, HEAD) This pcluster refers to partial decompressed data */ #define Z_EROFS_LI_PARTIAL_REF (1 << 15) -/* - * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the - * compressed block count of a compressed extent (in logical clusters, aka. - * block count of a pcluster). - */ +/* Set on 1st non-head lcluster to store compressed block count (in blocks) */ #define Z_EROFS_LI_D0_CBLKCNT (1 << 11) struct z_erofs_lcluster_index { @@ -428,19 +396,36 @@ struct z_erofs_lcluster_index { __le16 di_clusterofs; union { - /* for the HEAD lclusters */ - __le32 blkaddr; + __le32 blkaddr; /* for the HEAD lclusters */ /* - * for the NONHEAD lclusters * [0] - distance to its HEAD lcluster * [1] - distance to the next HEAD lcluster */ - __le16 delta[2]; + __le16 delta[2]; /* for the NONHEAD lclusters */ } di_u; }; -#define Z_EROFS_FULL_INDEX_ALIGN(end) \ - (ALIGN(end, 8) + sizeof(struct z_erofs_map_header) + 8) +#define Z_EROFS_MAP_HEADER_END(end) \ + (ALIGN(end, 8) + sizeof(struct z_erofs_map_header)) +#define Z_EROFS_FULL_INDEX_START(end) (Z_EROFS_MAP_HEADER_END(end) + 8) + +#define Z_EROFS_EXTENT_PLEN_PARTIAL BIT(27) +#define Z_EROFS_EXTENT_PLEN_FMT_BIT 28 +#define Z_EROFS_EXTENT_PLEN_MASK ((Z_EROFS_PCLUSTER_MAX_SIZE << 1) - 1) +struct z_erofs_extent { + __le32 plen; /* encoded length */ + __le32 pstart_lo; /* physical offset */ + __le32 pstart_hi; /* physical offset MSB */ + __le32 lstart_lo; /* logical offset */ + __le32 lstart_hi; /* logical offset MSB (>= 4GiB inodes) */ + __u8 reserved[12]; /* for future use */ +}; + +static inline int z_erofs_extent_recsize(unsigned int advise) +{ + return 4 << ((advise >> Z_EROFS_ADVISE_EXTRECSZ_BIT) & + Z_EROFS_ADVISE_EXTRECSZ_MASK); +} /* check the EROFS on-disk layout strictly at compile time */ static inline void erofs_check_ondisk_layout_definitions(void) diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index 0ffd1c63beeb..bec4b56b3826 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -112,7 +112,7 @@ static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio) void 
*src; src = erofs_read_metabuf(&buf, inode->i_sb, - map->m_pa + ofs, EROFS_KMAP); + map->m_pa + ofs, true); if (IS_ERR(src)) { err = PTR_ERR(src); break; diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index ce3d8737df85..9c9129bca346 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -276,7 +276,7 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req) size_t size = map.m_llen; void *src; - src = erofs_read_metabuf(&buf, sb, map.m_pa, EROFS_KMAP); + src = erofs_read_metabuf(&buf, sb, map.m_pa, true); if (IS_ERR(src)) return PTR_ERR(src); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index d4b89407822a..a0ae0b4f7b01 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -27,29 +27,27 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr, static int erofs_read_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; + erofs_blk_t blkaddr = erofs_blknr(sb, erofs_iloc(inode)); + unsigned int ofs = erofs_blkoff(sb, erofs_iloc(inode)); + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_sb_info *sbi = EROFS_SB(sb); + erofs_blk_t addrmask = BIT_ULL(48) - 1; struct erofs_inode *vi = EROFS_I(inode); - const erofs_off_t inode_loc = erofs_iloc(inode); - erofs_blk_t blkaddr, nblks = 0; - void *kaddr; + struct erofs_inode_extended *die, copied; struct erofs_inode_compact *dic; - struct erofs_inode_extended *die, *copied = NULL; - union erofs_inode_i_u iu; - struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - unsigned int ifmt, ofs; + unsigned int ifmt; + void *ptr; int err = 0; - blkaddr = erofs_blknr(sb, inode_loc); - ofs = erofs_blkoff(sb, inode_loc); - - kaddr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), EROFS_KMAP); - if (IS_ERR(kaddr)) { - erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld", - vi->nid, PTR_ERR(kaddr)); - return PTR_ERR(kaddr); + ptr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), true); + if (IS_ERR(ptr)) { + err = PTR_ERR(ptr); + erofs_err(sb, "failed to get inode (nid: %llu) page, err %d", + vi->nid, err); + goto err_out; } - dic = kaddr + ofs; + dic = ptr + ofs; ifmt = le16_to_cpu(dic->i_format); if (ifmt & ~EROFS_I_ALL) { erofs_err(sb, "unsupported i_format %u of nid %llu", @@ -73,40 +71,34 @@ static int erofs_read_inode(struct inode *inode) if (ofs + vi->inode_isize <= sb->s_blocksize) { ofs += vi->inode_isize; die = (struct erofs_inode_extended *)dic; + copied.i_u = die->i_u; + copied.i_nb = die->i_nb; } else { const unsigned int gotten = sb->s_blocksize - ofs; - copied = kmalloc(vi->inode_isize, GFP_KERNEL); - if (!copied) { - err = -ENOMEM; + memcpy(&copied, dic, gotten); + ptr = erofs_read_metabuf(&buf, sb, + erofs_pos(sb, blkaddr + 1), true); + if (IS_ERR(ptr)) { + err = PTR_ERR(ptr); + erofs_err(sb, "failed to get inode payload block (nid: %llu), err %d", + vi->nid, err); goto err_out; } - memcpy(copied, dic, gotten); - kaddr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr + 1), - EROFS_KMAP); - if (IS_ERR(kaddr)) { - erofs_err(sb, "failed to get inode payload block (nid: %llu), err %ld", - vi->nid, PTR_ERR(kaddr)); - kfree(copied); - return PTR_ERR(kaddr); - } ofs = vi->inode_isize - gotten; - memcpy((u8 *)copied + gotten, kaddr, ofs); - die = copied; + memcpy((u8 *)&copied + gotten, ptr, ofs); + die = &copied; } vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount); inode->i_mode = le16_to_cpu(die->i_mode); - iu = die->i_u; i_uid_write(inode, le32_to_cpu(die->i_uid)); i_gid_write(inode, le32_to_cpu(die->i_gid)); set_nlink(inode, le32_to_cpu(die->i_nlink)); - /* 
each extended inode has its own timestamp */ - inode_set_ctime(inode, le64_to_cpu(die->i_mtime), + inode_set_mtime(inode, le64_to_cpu(die->i_mtime), le32_to_cpu(die->i_mtime_nsec)); inode->i_size = le64_to_cpu(die->i_size); - kfree(copied); break; case EROFS_INODE_LAYOUT_COMPACT: vi->inode_isize = sizeof(struct erofs_inode_compact); @@ -114,12 +106,20 @@ static int erofs_read_inode(struct inode *inode) vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount); inode->i_mode = le16_to_cpu(dic->i_mode); - iu = dic->i_u; + copied.i_u = dic->i_u; i_uid_write(inode, le16_to_cpu(dic->i_uid)); i_gid_write(inode, le16_to_cpu(dic->i_gid)); - set_nlink(inode, le16_to_cpu(dic->i_nlink)); - /* use build time for compact inodes */ - inode_set_ctime(inode, sbi->build_time, sbi->build_time_nsec); + if (!S_ISDIR(inode->i_mode) && + ((ifmt >> EROFS_I_NLINK_1_BIT) & 1)) { + set_nlink(inode, 1); + copied.i_nb = dic->i_nb; + } else { + set_nlink(inode, le16_to_cpu(dic->i_nb.nlink)); + copied.i_nb.startblk_hi = 0; + addrmask = BIT_ULL(32) - 1; + } + inode_set_mtime(inode, sbi->epoch + le32_to_cpu(dic->i_mtime), + sbi->fixed_nsec); inode->i_size = le32_to_cpu(dic->i_size); break; @@ -136,19 +136,26 @@ static int erofs_read_inode(struct inode *inode) goto err_out; } switch (inode->i_mode & S_IFMT) { - case S_IFREG: case S_IFDIR: + vi->dot_omitted = (ifmt >> EROFS_I_DOT_OMITTED_BIT) & 1; + fallthrough; + case S_IFREG: case S_IFLNK: - vi->raw_blkaddr = le32_to_cpu(iu.raw_blkaddr); + vi->startblk = le32_to_cpu(copied.i_u.startblk_lo) | + ((u64)le16_to_cpu(copied.i_nb.startblk_hi) << 32); + if (vi->datalayout == EROFS_INODE_FLAT_PLAIN && + !((vi->startblk ^ EROFS_NULL_ADDR) & addrmask)) + vi->startblk = EROFS_NULL_ADDR; + + if (S_ISLNK(inode->i_mode)) { - err = erofs_fill_symlink(inode, kaddr, ofs); + err = erofs_fill_symlink(inode, ptr, ofs); if (err) goto err_out; } break; case S_IFCHR: case S_IFBLK: - inode->i_rdev = new_decode_dev(le32_to_cpu(iu.rdev)); + inode->i_rdev = new_decode_dev(le32_to_cpu(copied.i_u.rdev)); break; case S_IFIFO: case S_IFSOCK: @@ -161,12 +168,15 @@ static int erofs_read_inode(struct inode *inode) goto err_out; } - /* total blocks for compressed files */ - if (erofs_inode_is_data_compressed(vi->datalayout)) { - nblks = le32_to_cpu(iu.compressed_blocks); - } else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) { + if (erofs_inode_is_data_compressed(vi->datalayout)) + inode->i_blocks = le32_to_cpu(copied.i_u.blocks_lo) << + (sb->s_blocksize_bits - 9); + else + inode->i_blocks = round_up(inode->i_size, 
sb->s_blocksize) >> 9; - else - inode->i_blocks = nblks << (sb->s_blocksize_bits - 9); err_out: - DBG_BUGON(err); erofs_put_metabuf(&buf); return err; } @@ -202,13 +205,10 @@ static int erofs_fill_inode(struct inode *inode) int err; trace_erofs_fill_inode(inode); - - /* read inode base data from disk */ err = erofs_read_inode(inode); if (err) return err; - /* setup the new inode */ switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_op = &erofs_generic_iops; @@ -229,15 +229,10 @@ static int erofs_fill_inode(struct inode *inode) inode->i_op = &erofs_symlink_iops; inode_nohighmem(inode); break; - case S_IFCHR: - case S_IFBLK: - case S_IFIFO: - case S_IFSOCK: + default: inode->i_op = &erofs_generic_iops; init_special_inode(inode, inode->i_mode, inode->i_rdev); return 0; - default: - return -EFSCORRUPTED; } mapping_set_large_folios(inode->i_mapping); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 686d835eb533..4ac188d5d894 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -37,8 +37,7 @@ __printf(2, 3) void _erofs_printk(struct super_block *sb, const char *fmt, ...); typedef u64 erofs_nid_t; typedef u64 erofs_off_t; -/* data type for filesystem-wide blocks number */ -typedef u32 erofs_blk_t; +typedef u64 erofs_blk_t; struct erofs_device_info { char *path; @@ -47,8 +46,8 @@ struct erofs_device_info { struct dax_device *dax_dev; u64 dax_part_off; - u32 blocks; - u32 mapped_blkaddr; + erofs_blk_t blocks; + erofs_blk_t uniaddr; }; enum { @@ -143,8 +142,8 @@ struct erofs_sb_info { unsigned char blkszbits; /* filesystem block size in bit shift */ u32 sb_size; /* total superblock size */ - u32 build_time_nsec; - u64 build_time; + u32 fixed_nsec; + s64 epoch; /* what we really care is nid, rather than ino.. */ erofs_nid_t root_nid; @@ -152,8 +151,6 @@ struct erofs_sb_info { /* used for statfs, f_files - f_favail */ u64 inos; - u8 uuid[16]; /* 128-bit uuid for volume */ - u8 volume_name[16]; /* volume name */ u32 feature_compat; u32 feature_incompat; @@ -199,11 +196,6 @@ enum { EROFS_ZIP_CACHE_READAROUND }; -enum erofs_kmap_type { - EROFS_NO_KMAP, /* don't map the buffer */ - EROFS_KMAP, /* use kmap_local_page() to map the buffer */ -}; - struct erofs_buf { struct address_space *mapping; struct file *file; @@ -212,8 +204,8 @@ struct erofs_buf { }; #define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL }) -#define erofs_blknr(sb, addr) ((erofs_blk_t)((addr) >> (sb)->s_blocksize_bits)) -#define erofs_blkoff(sb, addr) ((addr) & ((sb)->s_blocksize - 1)) +#define erofs_blknr(sb, pos) ((erofs_blk_t)((pos) >> (sb)->s_blocksize_bits)) +#define erofs_blkoff(sb, pos) ((pos) & ((sb)->s_blocksize - 1)) #define erofs_pos(sb, blk) ((erofs_off_t)(blk) << (sb)->s_blocksize_bits) #define erofs_iblks(i) (round_up((i)->i_size, i_blocksize(i)) >> (i)->i_blkbits) @@ -233,6 +225,7 @@ EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING) EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS) EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE) EROFS_FEATURE_FUNCS(xattr_prefixes, incompat, INCOMPAT_XATTR_PREFIXES) +EROFS_FEATURE_FUNCS(48bit, incompat, INCOMPAT_48BIT) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER) @@ -252,6 +245,7 @@ struct erofs_inode { unsigned char datalayout; unsigned char inode_isize; + bool dot_omitted; unsigned int xattr_isize; unsigned int xattr_name_filter; @@ -259,7 +253,7 @@ struct erofs_inode { unsigned int *xattr_shared_xattrs; union { - erofs_blk_t raw_blkaddr; + 
erofs_blk_t startblk; struct { unsigned short chunkformat; unsigned char chunkbits; @@ -268,15 +262,13 @@ struct erofs_inode { struct { unsigned short z_advise; unsigned char z_algorithmtype[2]; - unsigned char z_logical_clusterbits; - unsigned long z_tailextent_headlcn; + unsigned char z_lclusterbits; union { - struct { - erofs_off_t z_idataoff; - unsigned short z_idata_size; - }; - erofs_off_t z_fragmentoff; + u64 z_tailextent_headlcn; + u64 z_extents; }; + erofs_off_t z_fragmentoff; + unsigned short z_idata_size; }; #endif /* CONFIG_EROFS_FS_ZIP */ }; @@ -387,11 +379,10 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, erofs_off_t *offset, int *lengthp); void erofs_unmap_metabuf(struct erofs_buf *buf); void erofs_put_metabuf(struct erofs_buf *buf); -void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, - enum erofs_kmap_type type); +void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap); void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb); void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, - erofs_off_t offset, enum erofs_kmap_type type); + erofs_off_t offset, bool need_kmap); int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); @@ -448,6 +439,7 @@ int __init erofs_init_shrinker(void); void erofs_exit_shrinker(void); int __init z_erofs_init_subsystem(void); void z_erofs_exit_subsystem(void); +int z_erofs_init_super(struct super_block *sb); unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, unsigned long nr_shrink); int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, @@ -457,7 +449,6 @@ void z_erofs_put_gbuf(void *ptr); int z_erofs_gbuf_growsize(unsigned int nrpages); int __init z_erofs_gbuf_init(void); void z_erofs_gbuf_exit(void); -int erofs_init_managed_cache(struct super_block *sb); int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb); #else static inline void erofs_shrinker_register(struct super_block *sb) {} @@ -466,7 +457,7 @@ static inline int erofs_init_shrinker(void) { return 0; } static inline void erofs_exit_shrinker(void) {} static inline int z_erofs_init_subsystem(void) { return 0; } static inline void z_erofs_exit_subsystem(void) {} -static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; } +static inline int z_erofs_init_super(struct super_block *sb) { return 0; } #endif /* !CONFIG_EROFS_FS_ZIP */ #ifdef CONFIG_EROFS_FS_BACKED_BY_FILE diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index c94d0c1608a8..f7cf4f41af28 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -100,7 +100,7 @@ static void *erofs_find_target_block(struct erofs_buf *target, struct erofs_dirent *de; buf.mapping = dir->i_mapping; - de = erofs_bread(&buf, erofs_pos(dir->i_sb, mid), EROFS_KMAP); + de = erofs_bread(&buf, erofs_pos(dir->i_sb, mid), true); if (!IS_ERR(de)) { const int nameoff = nameoff_from_disk(de->nameoff, bsz); const int ndirents = nameoff / sizeof(*de); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 827b62665649..cadec6b1b554 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -94,7 +94,7 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, int len, i, cnt; *offset = round_up(*offset, 4); - ptr = erofs_bread(buf, *offset, EROFS_KMAP); + ptr = erofs_bread(buf, *offset, true); if (IS_ERR(ptr)) return ptr; @@ -110,7 +110,7 @@ void *erofs_read_metadata(struct 
super_block *sb, struct erofs_buf *buf, for (i = 0; i < len; i += cnt) { cnt = min_t(int, sb->s_blocksize - erofs_blkoff(sb, *offset), len - i); - ptr = erofs_bread(buf, *offset, EROFS_KMAP); + ptr = erofs_bread(buf, *offset, true); if (IS_ERR(ptr)) { kfree(buffer); return ptr; @@ -141,7 +141,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, struct erofs_deviceslot *dis; struct file *file; - dis = erofs_read_metabuf(buf, sb, *pos, EROFS_KMAP); + dis = erofs_read_metabuf(buf, sb, *pos, true); if (IS_ERR(dis)) return PTR_ERR(dis); @@ -178,8 +178,8 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, dif->file = file; } - dif->blocks = le32_to_cpu(dis->blocks); - dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr); + dif->blocks = le32_to_cpu(dis->blocks_lo); + dif->uniaddr = le32_to_cpu(dis->uniaddr_lo); sbi->total_blocks += dif->blocks; *pos += EROFS_DEVT_SLOT_SIZE; return 0; @@ -255,7 +255,7 @@ static int erofs_read_superblock(struct super_block *sb) void *data; int ret; - data = erofs_read_metabuf(&buf, sb, 0, EROFS_KMAP); + data = erofs_read_metabuf(&buf, sb, 0, true); if (IS_ERR(data)) { erofs_err(sb, "cannot read erofs superblock"); return PTR_ERR(data); @@ -268,7 +268,7 @@ static int erofs_read_superblock(struct super_block *sb) goto out; } - sbi->blkszbits = dsb->blkszbits; + sbi->blkszbits = dsb->blkszbits; if (sbi->blkszbits < 9 || sbi->blkszbits > PAGE_SHIFT) { erofs_err(sb, "blkszbits %u isn't supported", sbi->blkszbits); goto out; @@ -299,7 +299,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi->sb_size); goto out; } - sbi->dif0.blocks = le32_to_cpu(dsb->blocks); + sbi->dif0.blocks = le32_to_cpu(dsb->blocks_lo); sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); #ifdef CONFIG_EROFS_FS_XATTR sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); @@ -308,23 +308,20 @@ static int erofs_read_superblock(struct super_block *sb) sbi->xattr_filter_reserved = dsb->xattr_filter_reserved; #endif sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact)); - sbi->root_nid = le16_to_cpu(dsb->root_nid); + if (erofs_sb_has_48bit(sbi) && dsb->rootnid_8b) { + sbi->root_nid = le64_to_cpu(dsb->rootnid_8b); + sbi->dif0.blocks = (sbi->dif0.blocks << 32) | + le16_to_cpu(dsb->rb.blocks_hi); + } else { + sbi->root_nid = le16_to_cpu(dsb->rb.rootnid_2b); + } sbi->packed_nid = le64_to_cpu(dsb->packed_nid); sbi->inos = le64_to_cpu(dsb->inos); - sbi->build_time = le64_to_cpu(dsb->build_time); - sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec); - + sbi->epoch = (s64)le64_to_cpu(dsb->epoch); + sbi->fixed_nsec = le32_to_cpu(dsb->fixed_nsec); super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid)); - ret = strscpy(sbi->volume_name, dsb->volume_name, - sizeof(dsb->volume_name)); - if (ret < 0) { /* -E2BIG */ - erofs_err(sb, "bad volume name without NIL terminator"); - ret = -EFSCORRUPTED; - goto out; - } - /* parse on-disk compression configurations */ ret = z_erofs_parse_cfgs(sb, dsb); if (ret < 0) @@ -333,6 +330,8 @@ static int erofs_read_superblock(struct super_block *sb) /* handle multiple devices */ ret = erofs_scan_devices(sb, dsb); + if (erofs_sb_has_48bit(sbi)) + erofs_info(sb, "EXPERIMENTAL 48-bit layout support in use. Use at your own risk!"); if (erofs_is_fscache_mode(sb)) erofs_info(sb, "[deprecated] fscache-based on-demand read feature in use. 
Use at your own risk!"); out: @@ -639,9 +638,16 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) else sb->s_flags &= ~SB_POSIXACL; -#ifdef CONFIG_EROFS_FS_ZIP - xa_init(&sbi->managed_pslots); -#endif + err = z_erofs_init_super(sb); + if (err) + return err; + + if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) { + inode = erofs_iget(sb, sbi->packed_nid); + if (IS_ERR(inode)) + return PTR_ERR(inode); + sbi->packed_inode = inode; + } inode = erofs_iget(sb, sbi->root_nid); if (IS_ERR(inode)) @@ -653,24 +659,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) iput(inode); return -EINVAL; } - sb->s_root = d_make_root(inode); if (!sb->s_root) return -ENOMEM; erofs_shrinker_register(sb); - if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) { - sbi->packed_inode = erofs_iget(sb, sbi->packed_nid); - if (IS_ERR(sbi->packed_inode)) { - err = PTR_ERR(sbi->packed_inode); - sbi->packed_inode = NULL; - return err; - } - } - err = erofs_init_managed_cache(sb); - if (err) - return err; - err = erofs_xattr_prefixes_init(sb); if (err) return err; @@ -806,6 +799,16 @@ static int erofs_init_fs_context(struct fs_context *fc) return 0; } +static void erofs_drop_internal_inodes(struct erofs_sb_info *sbi) +{ + iput(sbi->packed_inode); + sbi->packed_inode = NULL; +#ifdef CONFIG_EROFS_FS_ZIP + iput(sbi->managed_cache); + sbi->managed_cache = NULL; +#endif +} + static void erofs_kill_sb(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); @@ -815,6 +818,7 @@ static void erofs_kill_sb(struct super_block *sb) kill_anon_super(sb); else kill_block_super(sb); + erofs_drop_internal_inodes(sbi); fs_put_dax(sbi->dif0.dax_dev, NULL); erofs_fscache_unregister_fs(sb); erofs_sb_free(sbi); @@ -825,17 +829,10 @@ static void erofs_put_super(struct super_block *sb) { struct erofs_sb_info *const sbi = EROFS_SB(sb); - DBG_BUGON(!sbi); - erofs_unregister_sysfs(sb); erofs_shrinker_unregister(sb); erofs_xattr_prefixes_cleanup(sb); -#ifdef CONFIG_EROFS_FS_ZIP - iput(sbi->managed_cache); - sbi->managed_cache = NULL; -#endif - iput(sbi->packed_inode); - sbi->packed_inode = NULL; + erofs_drop_internal_inodes(sbi); erofs_free_dev_context(sbi->devs); sbi->devs = NULL; erofs_fscache_unregister_fs(sb); diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c index 19d586273b70..dad4e6c6c155 100644 --- a/fs/erofs/sysfs.c +++ b/fs/erofs/sysfs.c @@ -81,6 +81,7 @@ EROFS_ATTR_FEATURE(sb_chksum); EROFS_ATTR_FEATURE(ztailpacking); EROFS_ATTR_FEATURE(fragments); EROFS_ATTR_FEATURE(dedupe); +EROFS_ATTR_FEATURE(48bit); static struct attribute *erofs_feat_attrs[] = { ATTR_LIST(zero_padding), @@ -93,6 +94,7 @@ static struct attribute *erofs_feat_attrs[] = { ATTR_LIST(ztailpacking), ATTR_LIST(fragments), ATTR_LIST(dedupe), + ATTR_LIST(48bit), NULL, }; ATTRIBUTE_GROUPS(erofs_feat); diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index df2777e05661..9cf84717a92e 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -81,7 +81,7 @@ static int erofs_init_inode_xattrs(struct inode *inode) it.pos = erofs_iloc(inode) + vi->inode_isize; /* read in shared xattr array (non-atomic, see kmalloc below) */ - it.kaddr = erofs_bread(&it.buf, it.pos, EROFS_KMAP); + it.kaddr = erofs_bread(&it.buf, it.pos, true); if (IS_ERR(it.kaddr)) { ret = PTR_ERR(it.kaddr); goto out_unlock; @@ -102,7 +102,7 @@ static int erofs_init_inode_xattrs(struct inode *inode) it.pos += sizeof(struct erofs_xattr_ibody_header); for (i = 0; i < vi->xattr_shared_count; ++i) { - it.kaddr = erofs_bread(&it.buf, it.pos, EROFS_KMAP); 
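/*
 * Editorial sketch, not part of the patch: only two mapping modes of
 * erofs_bread()/erofs_read_metabuf() were ever used, so this series folds
 * enum erofs_kmap_type into a bool. Call sites now read as one of:
 *
 *	kaddr = erofs_bread(&buf, pos, true);             // was EROFS_KMAP
 *	mptr  = erofs_read_metabuf(&buf, sb, pos, false); // was EROFS_NO_KMAP
 *
 * `true` returns an address mapped with kmap_local_page(); `false` only
 * loads and pins the block, for callers such as the inline-pcluster path
 * in zdata.c that consume the page itself.
 */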
+ it.kaddr = erofs_bread(&it.buf, it.pos, true); if (IS_ERR(it.kaddr)) { kfree(vi->xattr_shared_xattrs); vi->xattr_shared_xattrs = NULL; @@ -183,7 +183,7 @@ static int erofs_xattr_copy_to_buffer(struct erofs_xattr_iter *it, void *src; for (processed = 0; processed < len; processed += slice) { - it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP); + it->kaddr = erofs_bread(&it->buf, it->pos, true); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); @@ -286,7 +286,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it) /* 2. handle xattr name */ for (processed = 0; processed < entry.e_name_len; processed += slice) { - it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP); + it->kaddr = erofs_bread(&it->buf, it->pos, true); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); @@ -330,7 +330,7 @@ static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it, it->pos = erofs_iloc(inode) + vi->inode_isize + xattr_header_sz; while (remaining) { - it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP); + it->kaddr = erofs_bread(&it->buf, it->pos, true); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); @@ -367,7 +367,7 @@ static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it, for (i = 0; i < vi->xattr_shared_count; ++i) { it->pos = erofs_pos(sb, sbi->xattr_blkaddr) + vi->xattr_shared_xattrs[i] * sizeof(__le32); - it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP); + it->kaddr = erofs_bread(&it->buf, it->pos, true); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index d771e06db738..0671184d9cf1 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -44,8 +44,8 @@ struct z_erofs_pcluster { /* A: point to next chained pcluster or TAILs */ struct z_erofs_pcluster *next; - /* I: start block address of this pcluster */ - erofs_off_t index; + /* I: start physical position of this pcluster */ + erofs_off_t pos; /* L: the maximum decompression size of this round */ unsigned int length; @@ -73,6 +73,9 @@ struct z_erofs_pcluster { /* I: compression algorithm format */ unsigned char algorithmformat; + /* I: whether compressed data is in-lined or not */ + bool from_meta; + /* L: whether partial decompression or not */ bool partial; @@ -102,14 +105,9 @@ struct z_erofs_decompressqueue { bool eio, sync; }; -static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) -{ - return !pcl->index; -} - static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) { - return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT; + return PAGE_ALIGN(pcl->pageofs_in + pcl->pclustersize) >> PAGE_SHIFT; } static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo) @@ -133,7 +131,7 @@ struct z_erofs_pcluster_slab { static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = { _PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128), - _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES) + _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES + 1) }; struct z_erofs_bvec_iter { @@ -267,7 +265,6 @@ static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size) pcl = kmem_cache_zalloc(pcs->slab, GFP_KERNEL); if (!pcl) return ERR_PTR(-ENOMEM); - pcl->pclustersize = size; return pcl; } return ERR_PTR(-EINVAL); @@ -516,6 +513,7 @@ static void z_erofs_bind_cache(struct z_erofs_frontend *fe) struct z_erofs_pcluster *pcl = fe->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); bool shouldalloc = z_erofs_should_alloc_cache(fe); + pgoff_t poff = pcl->pos >> PAGE_SHIFT; bool may_bypass = true; /* Optimistic allocation, as in-place 
I/O can be used as a fallback */ gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | @@ -532,7 +530,7 @@ static void z_erofs_bind_cache(struct z_erofs_frontend *fe) if (READ_ONCE(pcl->compressed_bvecs[i].page)) continue; - folio = filemap_get_folio(mc, pcl->index + i); + folio = filemap_get_folio(mc, poff + i); if (IS_ERR(folio)) { may_bypass = false; if (!shouldalloc) @@ -575,7 +573,7 @@ static int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi, struct folio *folio; int i; - DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); + DBG_BUGON(pcl->from_meta); /* Each cached folio contains one page unless bs > ps is supported */ for (i = 0; i < pclusterpages; ++i) { if (pcl->compressed_bvecs[i].page) { @@ -607,7 +605,7 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) ret = false; spin_lock(&pcl->lockref.lock); if (pcl->lockref.count <= 0) { - DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); + DBG_BUGON(pcl->from_meta); for (; bvec < end; ++bvec) { if (bvec->page && page_folio(bvec->page) == folio) { bvec->page = NULL; @@ -644,18 +642,18 @@ static const struct address_space_operations z_erofs_cache_aops = { .invalidate_folio = z_erofs_cache_invalidate_folio, }; -int erofs_init_managed_cache(struct super_block *sb) +int z_erofs_init_super(struct super_block *sb) { struct inode *const inode = new_inode(sb); if (!inode) return -ENOMEM; - set_nlink(inode, 1); inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &z_erofs_cache_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); EROFS_SB(sb)->managed_cache = inode; + xa_init(&EROFS_SB(sb)->managed_pslots); return 0; } @@ -667,16 +665,20 @@ static int z_erofs_attach_page(struct z_erofs_frontend *fe, int ret; if (exclusive) { - /* give priority for inplaceio to use file pages first */ - spin_lock(&pcl->lockref.lock); - while (fe->icur > 0) { - if (pcl->compressed_bvecs[--fe->icur].page) - continue; - pcl->compressed_bvecs[fe->icur] = *bvec; + /* Inplace I/O is limited to one page for uncompressed data */ + if (pcl->algorithmformat < Z_EROFS_COMPRESSION_MAX || + fe->icur <= 1) { + /* Try to prioritize inplace I/O here */ + spin_lock(&pcl->lockref.lock); + while (fe->icur > 0) { + if (pcl->compressed_bvecs[--fe->icur].page) + continue; + pcl->compressed_bvecs[fe->icur] = *bvec; + spin_unlock(&pcl->lockref.lock); + return 0; + } spin_unlock(&pcl->lockref.lock); - return 0; } - spin_unlock(&pcl->lockref.lock); /* otherwise, check if it can be used as a bvpage */ if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED && @@ -711,27 +713,26 @@ static int z_erofs_register_pcluster(struct z_erofs_frontend *fe) struct erofs_map_blocks *map = &fe->map; struct super_block *sb = fe->inode->i_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); - bool ztailpacking = map->m_flags & EROFS_MAP_META; struct z_erofs_pcluster *pcl, *pre; + unsigned int pageofs_in; int err; - if (!(map->m_flags & EROFS_MAP_ENCODED) || - (!ztailpacking && !erofs_blknr(sb, map->m_pa))) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - - /* no available pcluster, let's allocate one */ - pcl = z_erofs_alloc_pcluster(map->m_plen); + pageofs_in = erofs_blkoff(sb, map->m_pa); + pcl = z_erofs_alloc_pcluster(pageofs_in + map->m_plen); if (IS_ERR(pcl)) return PTR_ERR(pcl); lockref_init(&pcl->lockref); /* one ref for this request */ pcl->algorithmformat = map->m_algorithmformat; + pcl->pclustersize = map->m_plen; + pcl->pageofs_in = pageofs_in; pcl->length = 0; pcl->partial = true; pcl->next = fe->head; + pcl->pos = map->m_pa; + pcl->pageofs_in = pageofs_in; pcl->pageofs_out = 
map->m_la & ~PAGE_MASK; + pcl->from_meta = map->m_flags & EROFS_MAP_META; fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; /* @@ -741,13 +742,10 @@ static int z_erofs_register_pcluster(struct z_erofs_frontend *fe) mutex_init(&pcl->lock); DBG_BUGON(!mutex_trylock(&pcl->lock)); - if (ztailpacking) { - pcl->index = 0; /* which indicates ztailpacking */ - } else { - pcl->index = erofs_blknr(sb, map->m_pa); + if (!pcl->from_meta) { while (1) { xa_lock(&sbi->managed_pslots); - pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->index, + pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->pos, NULL, pcl, GFP_KERNEL); if (!pre || xa_is_err(pre) || z_erofs_get_pcluster(pre)) { xa_unlock(&sbi->managed_pslots); @@ -779,7 +777,6 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe) { struct erofs_map_blocks *map = &fe->map; struct super_block *sb = fe->inode->i_sb; - erofs_blk_t blknr = erofs_blknr(sb, map->m_pa); struct z_erofs_pcluster *pcl = NULL; int ret; @@ -790,9 +787,9 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe) if (!(map->m_flags & EROFS_MAP_META)) { while (1) { rcu_read_lock(); - pcl = xa_load(&EROFS_SB(sb)->managed_pslots, blknr); + pcl = xa_load(&EROFS_SB(sb)->managed_pslots, map->m_pa); if (!pcl || z_erofs_get_pcluster(pcl)) { - DBG_BUGON(pcl && blknr != pcl->index); + DBG_BUGON(pcl && map->m_pa != pcl->pos); rcu_read_unlock(); break; } @@ -826,13 +823,13 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe) z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); - if (!z_erofs_is_inline_pcluster(fe->pcl)) { + if (!fe->pcl->from_meta) { /* bind cache first when cached decompression is preferred */ z_erofs_bind_cache(fe); } else { void *mptr; - mptr = erofs_read_metabuf(&map->buf, sb, map->m_pa, EROFS_NO_KMAP); + mptr = erofs_read_metabuf(&map->buf, sb, map->m_pa, false); if (IS_ERR(mptr)) { ret = PTR_ERR(mptr); erofs_err(sb, "failed to get inline data %d", ret); @@ -871,7 +868,7 @@ static bool __erofs_try_to_release_pcluster(struct erofs_sb_info *sbi, * It's impossible to fail after the pcluster is freezed, but in order * to avoid some race conditions, add a DBG_BUGON to observe this. 
*/ - DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->index) != pcl); + DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->pos) != pcl); lockref_mark_dead(&pcl->lockref); return true; @@ -967,7 +964,7 @@ static int z_erofs_read_fragment(struct super_block *sb, struct folio *folio, buf.mapping = packed_inode->i_mapping; for (; cur < end; cur += cnt, pos += cnt) { cnt = min(end - cur, sb->s_blocksize - erofs_blkoff(sb, pos)); - src = erofs_bread(&buf, pos, EROFS_KMAP); + src = erofs_bread(&buf, pos, true); if (IS_ERR(src)) { erofs_put_metabuf(&buf); return PTR_ERR(src); @@ -1221,7 +1218,7 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_backend *be, bool *overlapped) } be->compressed_pages[i] = page; - if (z_erofs_is_inline_pcluster(pcl) || + if (pcl->from_meta || erofs_folio_is_managed(EROFS_SB(be->sb), page_folio(page))) { if (!PageUptodate(page)) err = -EIO; @@ -1284,6 +1281,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err) .sb = be->sb, .in = be->compressed_pages, .out = be->decompressed_pages, + .inpages = pclusterpages, + .outpages = be->nr_pages, .pageofs_in = pcl->pageofs_in, .pageofs_out = pcl->pageofs_out, .inputsize = pcl->pclustersize, @@ -1297,7 +1296,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err) }, be->pagepool); /* must handle all compressed pages before actual file pages */ - if (z_erofs_is_inline_pcluster(pcl)) { + if (pcl->from_meta) { page = pcl->compressed_bvecs[0].page; WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL); put_page(page); @@ -1357,7 +1356,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err) WRITE_ONCE(pcl->next, NULL); mutex_unlock(&pcl->lock); - if (z_erofs_is_inline_pcluster(pcl)) + if (pcl->from_meta) z_erofs_free_pcluster(pcl); else z_erofs_put_pcluster(sbi, pcl, try_free); @@ -1538,7 +1537,7 @@ out_allocfolio: folio = page_folio(page); out_tocache: if (!tocache || bs != PAGE_SIZE || - filemap_add_folio(mc, folio, pcl->index + nr, gfp)) { + filemap_add_folio(mc, folio, (pcl->pos >> PAGE_SHIFT) + nr, gfp)) { /* turn into a temporary shortlived folio (1 ref) */ folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE; return; @@ -1655,19 +1654,20 @@ static void z_erofs_submit_queue(struct z_erofs_frontend *f, pcl = next; next = READ_ONCE(pcl->next); - if (z_erofs_is_inline_pcluster(pcl)) { + if (pcl->from_meta) { z_erofs_move_to_bypass_queue(pcl, next, qtail); continue; } /* no device id here, thus it will always succeed */ mdev = (struct erofs_map_dev) { - .m_pa = erofs_pos(sb, pcl->index), + .m_pa = round_down(pcl->pos, sb->s_blocksize), }; (void)erofs_map_dev(sb, &mdev); cur = mdev.m_pa; - end = cur + pcl->pclustersize; + end = round_up(cur + pcl->pageofs_in + pcl->pclustersize, + sb->s_blocksize); do { bvec.bv_page = NULL; if (bio && (cur != last_pa || diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 689437e99a5a..8de50df05dfe 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -25,13 +25,13 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, { struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); - const erofs_off_t pos = Z_EROFS_FULL_INDEX_ALIGN(erofs_iloc(inode) + + const erofs_off_t pos = Z_EROFS_FULL_INDEX_START(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize) + lcn * sizeof(struct z_erofs_lcluster_index); struct z_erofs_lcluster_index *di; unsigned int advise; - di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, EROFS_KMAP); + di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, true); if 
(IS_ERR(di)) return PTR_ERR(di); m->lcn = lcn; @@ -40,7 +40,7 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, advise = le16_to_cpu(di->di_advise); m->type = advise & Z_EROFS_LI_LCLUSTER_TYPE_MASK; if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { - m->clusterofs = 1 << vi->z_logical_clusterbits; + m->clusterofs = 1 << vi->z_lclusterbits; m->delta[0] = le16_to_cpu(di->di_u.delta[0]); if (m->delta[0] & Z_EROFS_LI_D0_CBLKCNT) { if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | @@ -55,7 +55,7 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, } else { m->partialref = !!(advise & Z_EROFS_LI_PARTIAL_REF); m->clusterofs = le16_to_cpu(di->di_clusterofs); - if (m->clusterofs >= 1 << vi->z_logical_clusterbits) { + if (m->clusterofs >= 1 << vi->z_lclusterbits) { DBG_BUGON(1); return -EFSCORRUPTED; } @@ -102,9 +102,9 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, { struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); - const erofs_off_t ebase = sizeof(struct z_erofs_map_header) + - ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); - const unsigned int lclusterbits = vi->z_logical_clusterbits; + const erofs_off_t ebase = Z_EROFS_MAP_HEADER_END(erofs_iloc(inode) + + vi->inode_isize + vi->xattr_isize); + const unsigned int lclusterbits = vi->z_lclusterbits; const unsigned int totalidx = erofs_iblks(inode); unsigned int compacted_4b_initial, compacted_2b, amortizedshift; unsigned int vcnt, lo, lobits, encodebits, nblk, bytes; @@ -146,7 +146,7 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, else return -EOPNOTSUPP; - in = erofs_read_metabuf(&m->map->buf, m->inode->i_sb, pos, EROFS_KMAP); + in = erofs_read_metabuf(&m->map->buf, m->inode->i_sb, pos, true); if (IS_ERR(in)) return PTR_ERR(in); @@ -255,7 +255,7 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, { struct super_block *sb = m->inode->i_sb; struct erofs_inode *const vi = EROFS_I(m->inode); - const unsigned int lclusterbits = vi->z_logical_clusterbits; + const unsigned int lclusterbits = vi->z_lclusterbits; while (m->lcn >= lookback_distance) { unsigned long lcn = m->lcn - lookback_distance; @@ -265,26 +265,22 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, if (err) return err; - switch (m->type) { - case Z_EROFS_LCLUSTER_TYPE_NONHEAD: + if (m->type >= Z_EROFS_LCLUSTER_TYPE_MAX) { + erofs_err(sb, "unknown type %u @ lcn %lu of nid %llu", + m->type, lcn, vi->nid); + DBG_BUGON(1); + return -EOPNOTSUPP; + } else if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { lookback_distance = m->delta[0]; if (!lookback_distance) - goto err_bogus; + break; continue; - case Z_EROFS_LCLUSTER_TYPE_PLAIN: - case Z_EROFS_LCLUSTER_TYPE_HEAD1: - case Z_EROFS_LCLUSTER_TYPE_HEAD2: + } else { m->headtype = m->type; m->map->m_la = (lcn << lclusterbits) | m->clusterofs; return 0; - default: - erofs_err(sb, "unknown type %u @ lcn %lu of nid %llu", - m->type, lcn, vi->nid); - DBG_BUGON(1); - return -EOPNOTSUPP; } } -err_bogus: erofs_err(sb, "bogus lookback distance %u @ lcn %lu of nid %llu", lookback_distance, m->lcn, vi->nid); DBG_BUGON(1); @@ -308,7 +304,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, if ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD1 && !bigpcl1) || ((m->headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN || m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) && !bigpcl2) || - (lcn << vi->z_logical_clusterbits) >= inode->i_size) + (lcn << vi->z_lclusterbits) >= 
inode->i_size) m->compressedblks = 1; if (m->compressedblks) @@ -329,35 +325,28 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, DBG_BUGON(lcn == initial_lcn && m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD); - switch (m->type) { - case Z_EROFS_LCLUSTER_TYPE_PLAIN: - case Z_EROFS_LCLUSTER_TYPE_HEAD1: - case Z_EROFS_LCLUSTER_TYPE_HEAD2: + if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { + if (m->delta[0] != 1) { + erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + if (m->compressedblks) + goto out; + } else if (m->type < Z_EROFS_LCLUSTER_TYPE_MAX) { /* * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type * rather than CBLKCNT, it's a 1 block-sized pcluster. */ m->compressedblks = 1; - break; - case Z_EROFS_LCLUSTER_TYPE_NONHEAD: - if (m->delta[0] != 1) - goto err_bonus_cblkcnt; - if (m->compressedblks) - break; - fallthrough; - default: - erofs_err(sb, "cannot found CBLKCNT @ lcn %lu of nid %llu", lcn, - vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; + goto out; } + erofs_err(sb, "cannot found CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; out: m->map->m_plen = erofs_pos(sb, m->compressedblks); return 0; -err_bonus_cblkcnt: - erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; } static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) @@ -365,7 +354,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) struct inode *inode = m->inode; struct erofs_inode *vi = EROFS_I(inode); struct erofs_map_blocks *map = m->map; - unsigned int lclusterbits = vi->z_logical_clusterbits; + unsigned int lclusterbits = vi->z_lclusterbits; u64 lcn = m->lcn, headlcn = map->m_la >> lclusterbits; int err; @@ -386,9 +375,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) m->delta[1] = 1; DBG_BUGON(1); } - } else if (m->type == Z_EROFS_LCLUSTER_TYPE_PLAIN || - m->type == Z_EROFS_LCLUSTER_TYPE_HEAD1 || - m->type == Z_EROFS_LCLUSTER_TYPE_HEAD2) { + } else if (m->type < Z_EROFS_LCLUSTER_TYPE_MAX) { if (lcn != headlcn) break; /* ends at the next HEAD lcluster */ m->delta[1] = 1; @@ -404,23 +391,32 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) return 0; } -static int z_erofs_do_map_blocks(struct inode *inode, +static int z_erofs_map_blocks_fo(struct inode *inode, struct erofs_map_blocks *map, int flags) { - struct erofs_inode *const vi = EROFS_I(inode); - bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER; + struct erofs_inode *vi = EROFS_I(inode); + struct super_block *sb = inode->i_sb; bool fragment = vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; + bool ztailpacking = vi->z_idata_size; + unsigned int lclusterbits = vi->z_lclusterbits; struct z_erofs_maprecorder m = { .inode = inode, .map = map, }; int err = 0; - unsigned int lclusterbits, endoff, afmt; + unsigned int endoff, afmt; unsigned long initial_lcn; unsigned long long ofs, end; - lclusterbits = vi->z_logical_clusterbits; ofs = flags & EROFS_GET_BLOCKS_FINDTAIL ? 
inode->i_size - 1 : map->m_la; + if (fragment && !(flags & EROFS_GET_BLOCKS_FINDTAIL) && + !vi->z_tailextent_headlcn) { + map->m_la = 0; + map->m_llen = inode->i_size; + map->m_flags = EROFS_MAP_MAPPED | + EROFS_MAP_FULL_MAPPED | EROFS_MAP_FRAGMENT; + return 0; + } initial_lcn = ofs >> lclusterbits; endoff = ofs & ((1 << lclusterbits) - 1); @@ -428,9 +424,8 @@ static int z_erofs_do_map_blocks(struct inode *inode, if (err) goto unmap_out; - if (ztailpacking && (flags & EROFS_GET_BLOCKS_FINDTAIL)) - vi->z_idataoff = m.nextpackoff; - + if ((flags & EROFS_GET_BLOCKS_FINDTAIL) && ztailpacking) + vi->z_fragmentoff = m.nextpackoff; map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED; end = (m.lcn + 1ULL) << lclusterbits; @@ -452,8 +447,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, } /* m.lcn should be >= 1 if endoff < m.clusterofs */ if (!m.lcn) { - erofs_err(inode->i_sb, - "invalid logical cluster 0 at nid %llu", + erofs_err(sb, "invalid logical cluster 0 at nid %llu", vi->nid); err = -EFSCORRUPTED; goto unmap_out; @@ -469,8 +463,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, goto unmap_out; break; default: - erofs_err(inode->i_sb, - "unknown type %u @ offset %llu of nid %llu", + erofs_err(sb, "unknown type %u @ offset %llu of nid %llu", m.type, ofs, vi->nid); err = -EOPNOTSUPP; goto unmap_out; @@ -487,12 +480,18 @@ static int z_erofs_do_map_blocks(struct inode *inode, } if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) { map->m_flags |= EROFS_MAP_META; - map->m_pa = vi->z_idataoff; + map->m_pa = vi->z_fragmentoff; map->m_plen = vi->z_idata_size; + if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) { + erofs_err(sb, "invalid tail-packing pclustersize %llu", + map->m_plen); + err = -EFSCORRUPTED; + goto unmap_out; + } } else if (fragment && m.lcn == vi->z_tailextent_headlcn) { map->m_flags |= EROFS_MAP_FRAGMENT; } else { - map->m_pa = erofs_pos(inode->i_sb, m.pblk); + map->m_pa = erofs_pos(sb, m.pblk); err = z_erofs_get_extent_compressedlen(&m, initial_lcn); if (err) goto unmap_out; @@ -511,7 +510,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ? 
vi->z_algorithmtype[1] : vi->z_algorithmtype[0]; if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) { - erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu", + erofs_err(sb, "inconsistent algorithmtype %u for nid %llu", afmt, vi->nid); err = -EFSCORRUPTED; goto unmap_out; @@ -535,6 +534,115 @@ unmap_out: return err; } +static int z_erofs_map_blocks_ext(struct inode *inode, + struct erofs_map_blocks *map, int flags) +{ + struct erofs_inode *vi = EROFS_I(inode); + struct super_block *sb = inode->i_sb; + bool interlaced = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER; + unsigned int recsz = z_erofs_extent_recsize(vi->z_advise); + erofs_off_t pos = round_up(Z_EROFS_MAP_HEADER_END(erofs_iloc(inode) + + vi->inode_isize + vi->xattr_isize), recsz); + erofs_off_t lend = inode->i_size; + erofs_off_t l, r, mid, pa, la, lstart; + struct z_erofs_extent *ext; + unsigned int fmt; + bool last; + + map->m_flags = 0; + if (recsz <= offsetof(struct z_erofs_extent, pstart_hi)) { + if (recsz <= offsetof(struct z_erofs_extent, pstart_lo)) { + ext = erofs_read_metabuf(&map->buf, sb, pos, true); + if (IS_ERR(ext)) + return PTR_ERR(ext); + pa = le64_to_cpu(*(__le64 *)ext); + pos += sizeof(__le64); + lstart = 0; + } else { + lstart = map->m_la >> vi->z_lclusterbits; + pa = EROFS_NULL_ADDR; + } + + for (; lstart <= map->m_la; lstart += 1 << vi->z_lclusterbits) { + ext = erofs_read_metabuf(&map->buf, sb, pos, true); + if (IS_ERR(ext)) + return PTR_ERR(ext); + map->m_plen = le32_to_cpu(ext->plen); + if (pa != EROFS_NULL_ADDR) { + map->m_pa = pa; + pa += map->m_plen & Z_EROFS_EXTENT_PLEN_MASK; + } else { + map->m_pa = le32_to_cpu(ext->pstart_lo); + } + pos += recsz; + } + last = (lstart >= round_up(lend, 1 << vi->z_lclusterbits)); + lend = min(lstart, lend); + lstart -= 1 << vi->z_lclusterbits; + } else { + lstart = lend; + for (l = 0, r = vi->z_extents; l < r; ) { + mid = l + (r - l) / 2; + ext = erofs_read_metabuf(&map->buf, sb, + pos + mid * recsz, true); + if (IS_ERR(ext)) + return PTR_ERR(ext); + + la = le32_to_cpu(ext->lstart_lo); + pa = le32_to_cpu(ext->pstart_lo) | + (u64)le32_to_cpu(ext->pstart_hi) << 32; + if (recsz > offsetof(struct z_erofs_extent, lstart_hi)) + la |= (u64)le32_to_cpu(ext->lstart_hi) << 32; + + if (la > map->m_la) { + r = mid; + lend = la; + } else { + l = mid + 1; + if (map->m_la == la) + r = min(l + 1, r); + lstart = la; + map->m_plen = le32_to_cpu(ext->plen); + map->m_pa = pa; + } + } + last = (l >= vi->z_extents); + } + + if (lstart < lend) { + map->m_la = lstart; + if (last && (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) { + map->m_flags |= EROFS_MAP_MAPPED | EROFS_MAP_FRAGMENT; + vi->z_fragmentoff = map->m_plen; + if (recsz >= offsetof(struct z_erofs_extent, pstart_lo)) + vi->z_fragmentoff |= map->m_pa << 32; + } else if (map->m_plen) { + map->m_flags |= EROFS_MAP_MAPPED | + EROFS_MAP_FULL_MAPPED | EROFS_MAP_ENCODED; + fmt = map->m_plen >> Z_EROFS_EXTENT_PLEN_FMT_BIT; + if (fmt) + map->m_algorithmformat = fmt - 1; + else if (interlaced && !erofs_blkoff(sb, map->m_pa)) + map->m_algorithmformat = + Z_EROFS_COMPRESSION_INTERLACED; + else + map->m_algorithmformat = + Z_EROFS_COMPRESSION_SHIFTED; + if (map->m_plen & Z_EROFS_EXTENT_PLEN_PARTIAL) + map->m_flags |= EROFS_MAP_PARTIAL_REF; + map->m_plen &= Z_EROFS_EXTENT_PLEN_MASK; + } + } + map->m_llen = lend - map->m_la; + if (!last && map->m_llen < sb->s_blocksize) { + erofs_err(sb, "extent too small %llu @ offset %llu of nid %llu", + map->m_llen, map->m_la, vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; 
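/*
 * Editorial sketch, not part of the patch: assuming the usual
 * Z_EROFS_PCLUSTER_MAX_SIZE of 1MiB, one on-disk `plen` word handled by
 * z_erofs_map_blocks_ext() above decodes as:
 *
 *	u32 plen = le32_to_cpu(ext->plen);
 *	u32 bytes = plen & Z_EROFS_EXTENT_PLEN_MASK;       // bits 0..20
 *	bool partial = plen & Z_EROFS_EXTENT_PLEN_PARTIAL; // bit 27
 *	u32 fmt = plen >> Z_EROFS_EXTENT_PLEN_FMT_BIT;     // bits 28..31
 *
 * fmt == 0 means "derive the algorithm from the INTERLACED advise bit and
 * the block offset"; a nonzero fmt stores (algorithm + 1) directly, which
 * is why the code above selects `fmt - 1` when fmt is set.
 */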
+ } + return 0; +} + static int z_erofs_fill_inode_lazy(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); @@ -561,7 +669,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) goto out_unlock; pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); - h = erofs_read_metabuf(&buf, sb, pos, EROFS_KMAP); + h = erofs_read_metabuf(&buf, sb, pos, true); if (IS_ERR(h)) { err = PTR_ERR(h); goto out_unlock; @@ -578,8 +686,20 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) goto done; } vi->z_advise = le16_to_cpu(h->h_advise); + vi->z_lclusterbits = sb->s_blocksize_bits + (h->h_clusterbits & 15); + if (vi->datalayout == EROFS_INODE_COMPRESSED_FULL && + (vi->z_advise & Z_EROFS_ADVISE_EXTENTS)) { + vi->z_extents = le32_to_cpu(h->h_extents_lo) | + ((u64)le16_to_cpu(h->h_extents_hi) << 32); + goto done; + } + vi->z_algorithmtype[0] = h->h_algorithmtype & 15; vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; + if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) + vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff); + else if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) + vi->z_idata_size = le16_to_cpu(h->h_idata_size); headnr = 0; if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX || @@ -590,7 +710,6 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) goto out_put_metabuf; } - vi->z_logical_clusterbits = sb->s_blocksize_bits + (h->h_clusterbits & 7); if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { @@ -608,34 +727,13 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) goto out_put_metabuf; } - if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) { + if (vi->z_idata_size || + (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) { struct erofs_map_blocks map = { .buf = __EROFS_BUF_INITIALIZER }; - vi->z_idata_size = le16_to_cpu(h->h_idata_size); - err = z_erofs_do_map_blocks(inode, &map, - EROFS_GET_BLOCKS_FINDTAIL); - erofs_put_metabuf(&map.buf); - - if (!map.m_plen || - erofs_blkoff(sb, map.m_pa) + map.m_plen > sb->s_blocksize) { - erofs_err(sb, "invalid tail-packing pclustersize %llu", - map.m_plen); - err = -EFSCORRUPTED; - } - if (err < 0) - goto out_put_metabuf; - } - - if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER && - !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) { - struct erofs_map_blocks map = { - .buf = __EROFS_BUF_INITIALIZER - }; - - vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff); - err = z_erofs_do_map_blocks(inode, &map, + err = z_erofs_map_blocks_fo(inode, &map, EROFS_GET_BLOCKS_FINDTAIL); erofs_put_metabuf(&map.buf); if (err < 0) @@ -666,15 +764,11 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, } else { err = z_erofs_fill_inode_lazy(inode); if (!err) { - if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) && - !vi->z_tailextent_headlcn) { - map->m_la = 0; - map->m_llen = inode->i_size; - map->m_flags = EROFS_MAP_MAPPED | - EROFS_MAP_FULL_MAPPED | EROFS_MAP_FRAGMENT; - } else { - err = z_erofs_do_map_blocks(inode, map, flags); - } + if (vi->datalayout == EROFS_INODE_COMPRESSED_FULL && + (vi->z_advise & Z_EROFS_ADVISE_EXTENTS)) + err = z_erofs_map_blocks_ext(inode, map, flags); + else + err = z_erofs_map_blocks_fo(inode, map, flags); } if (!err && (map->m_flags & EROFS_MAP_ENCODED) && unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE || diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 9b06a0ab9c32..100376863a44 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -447,7 +447,7 @@ static bool 
ep_busy_loop(struct eventpoll *ep) if (!budget) budget = BUSY_POLL_BUDGET; - if (napi_id >= MIN_NAPI_ID && ep_busy_loop_on(ep)) { + if (napi_id_valid(napi_id) && ep_busy_loop_on(ep)) { napi_busy_loop(napi_id, ep_busy_loop_end, ep, prefer_busy_poll, budget); if (ep_events_available(ep)) @@ -492,7 +492,7 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) * or * Nothing to do if we already have this ID */ - if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id) + if (!napi_id_valid(napi_id) || napi_id == ep->napi_id) return; /* record NAPI ID for use in next busy poll */ @@ -546,7 +546,7 @@ static void ep_suspend_napi_irqs(struct eventpoll *ep) { unsigned int napi_id = READ_ONCE(ep->napi_id); - if (napi_id >= MIN_NAPI_ID && READ_ONCE(ep->prefer_busy_poll)) + if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll)) napi_suspend_irqs(napi_id); } @@ -554,7 +554,7 @@ static void ep_resume_napi_irqs(struct eventpoll *ep) { unsigned int napi_id = READ_ONCE(ep->napi_id); - if (napi_id >= MIN_NAPI_ID && READ_ONCE(ep->prefer_busy_poll)) + if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll)) napi_resume_irqs(napi_id); } diff --git a/fs/exec.c b/fs/exec.c index 506cd411f4ac..f45859ad13ac 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -755,8 +755,6 @@ int setup_arg_pages(struct linux_binprm *bprm, mm->arg_start = bprm->p; #endif - if (bprm->loader) - bprm->loader -= stack_shift; bprm->exec -= stack_shift; if (mmap_write_lock_killable(mm)) diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c index 9ff825f1502d..cc01556c9d9b 100644 --- a/fs/exfat/balloc.c +++ b/fs/exfat/balloc.c @@ -147,7 +147,6 @@ int exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync) unsigned int ent_idx; struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); - struct exfat_mount_options *opts = &sbi->options; if (!is_valid_cluster(sbi, clu)) return -EIO; @@ -163,19 +162,6 @@ int exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync) exfat_update_bh(sbi->vol_amap[i], sync); - if (opts->discard) { - int ret_discard; - - ret_discard = sb_issue_discard(sb, - exfat_cluster_to_sector(sbi, clu), - (1 << sbi->sect_per_clus_bits), GFP_NOFS, 0); - - if (ret_discard == -EOPNOTSUPP) { - exfat_err(sb, "discard not supported by device, disabling"); - opts->discard = 0; - } - } - return 0; } diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index d30ce18a88b7..f8ead4d47ef0 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -14,8 +14,6 @@ #define EXFAT_ROOT_INO 1 -#define EXFAT_CLUSTERS_UNTRACKED (~0u) - /* * exfat error flags */ diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index 6f3651c6ca91..23065f948ae7 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -144,6 +144,20 @@ int exfat_chain_cont_cluster(struct super_block *sb, unsigned int chain, return 0; } +static inline void exfat_discard_cluster(struct super_block *sb, + unsigned int clu, unsigned int num_clusters) +{ + int ret; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + + ret = sb_issue_discard(sb, exfat_cluster_to_sector(sbi, clu), + sbi->sect_per_clus * num_clusters, GFP_NOFS, 0); + if (ret == -EOPNOTSUPP) { + exfat_err(sb, "discard not supported by device, disabling"); + sbi->options.discard = 0; + } +} + /* This function must be called with bitmap_lock held */ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain) { @@ -196,7 +210,12 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain clu++; num_clusters++; } while 
(num_clusters < p_chain->size); + + if (sbi->options.discard) + exfat_discard_cluster(sb, p_chain->dir, p_chain->size); } else { + unsigned int nr_clu = 1; + do { bool sync = false; unsigned int n_clu = clu; @@ -215,6 +234,16 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain if (exfat_clear_bitmap(inode, clu, (sync && IS_DIRSYNC(inode)))) break; + + if (sbi->options.discard) { + if (n_clu == clu + 1) + nr_clu++; + else { + exfat_discard_cluster(sb, clu - nr_clu + 1, nr_clu); + nr_clu = 1; + } + } + clu = n_clu; num_clusters++; @@ -265,7 +294,7 @@ int exfat_find_last_cluster(struct super_block *sb, struct exfat_chain *p_chain, clu = next; if (exfat_ent_get(sb, clu, &next)) return -EIO; - } while (next != EXFAT_EOF_CLUSTER); + } while (next != EXFAT_EOF_CLUSTER && count <= p_chain->size); if (p_chain->size != count) { exfat_fs_error(sb, diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 807349d8ea05..841a5b18e3df 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -582,6 +582,9 @@ static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) loff_t pos = iocb->ki_pos; loff_t valid_size; + if (unlikely(exfat_forced_shutdown(inode->i_sb))) + return -EIO; + inode_lock(inode); valid_size = ei->valid_size; @@ -635,6 +638,16 @@ unlock: return ret; } +static ssize_t exfat_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + if (unlikely(exfat_forced_shutdown(inode->i_sb))) + return -EIO; + + return generic_file_read_iter(iocb, iter); +} + static vm_fault_t exfat_page_mkwrite(struct vm_fault *vmf) { int err; @@ -672,14 +685,26 @@ static const struct vm_operations_struct exfat_file_vm_ops = { static int exfat_file_mmap(struct file *file, struct vm_area_struct *vma) { + if (unlikely(exfat_forced_shutdown(file_inode(file)->i_sb))) + return -EIO; + file_accessed(file); vma->vm_ops = &exfat_file_vm_ops; return 0; } +static ssize_t exfat_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, unsigned int flags) +{ + if (unlikely(exfat_forced_shutdown(file_inode(in)->i_sb))) + return -EIO; + + return filemap_splice_read(in, ppos, pipe, len, flags); +} + const struct file_operations exfat_file_operations = { .llseek = generic_file_llseek, - .read_iter = generic_file_read_iter, + .read_iter = exfat_file_read_iter, .write_iter = exfat_file_write_iter, .unlocked_ioctl = exfat_ioctl, #ifdef CONFIG_COMPAT @@ -687,7 +712,7 @@ const struct file_operations exfat_file_operations = { #endif .mmap = exfat_file_mmap, .fsync = exfat_file_fsync, - .splice_read = filemap_splice_read, + .splice_read = exfat_splice_read, .splice_write = iter_file_splice_write, }; diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 96952d4acb50..b22c02d6000f 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -274,9 +274,11 @@ static int exfat_get_block(struct inode *inode, sector_t iblock, sector_t last_block; sector_t phys = 0; sector_t valid_blks; + loff_t i_size; mutex_lock(&sbi->s_lock); - last_block = EXFAT_B_TO_BLK_ROUND_UP(i_size_read(inode), sb); + i_size = i_size_read(inode); + last_block = EXFAT_B_TO_BLK_ROUND_UP(i_size, sb); if (iblock >= last_block && !create) goto done; @@ -305,77 +307,99 @@ static int exfat_get_block(struct inode *inode, sector_t iblock, if (buffer_delay(bh_result)) clear_buffer_delay(bh_result); - if (create) { + /* + * In most cases, we just need to set bh_result to mapped, unmapped + * or new status as follows: + * 1. i_size == valid_size + * 2. 
write case (create == 1) + * 3. direct_read (!bh_result->b_folio) + * -> the unwritten part will be zeroed in exfat_direct_IO() + * + * Otherwise, in the case of buffered read, it is necessary to take + * care of the last partially written block if valid_size is not equal to i_size. + */ + if (i_size == ei->valid_size || create || !bh_result->b_folio) valid_blks = EXFAT_B_TO_BLK_ROUND_UP(ei->valid_size, sb); + else + valid_blks = EXFAT_B_TO_BLK(ei->valid_size, sb); - if (iblock + max_blocks < valid_blks) { - /* The range has been written, map it */ - goto done; - } else if (iblock < valid_blks) { - /* - * The range has been partially written, - * map the written part. - */ - max_blocks = valid_blks - iblock; - goto done; - } + /* The range has been fully written, map it */ + if (iblock + max_blocks < valid_blks) + goto done; - /* The area has not been written, map and mark as new. */ - set_buffer_new(bh_result); + /* The range has been partially written, map the written part */ + if (iblock < valid_blks) { + max_blocks = valid_blks - iblock; + goto done; + } + /* The area has not been written, map and mark as new for create case */ + if (create) { + set_buffer_new(bh_result); ei->valid_size = EXFAT_BLK_TO_B(iblock + max_blocks, sb); mark_inode_dirty(inode); - } else { - valid_blks = EXFAT_B_TO_BLK(ei->valid_size, sb); + goto done; + } - if (iblock + max_blocks < valid_blks) { - /* The range has been written, map it */ + /* + * The area has just one partially written block. + * In that case, we should read and fill the unwritten part of + * a block with zero. + */ + if (bh_result->b_folio && iblock == valid_blks && + (ei->valid_size & (sb->s_blocksize - 1))) { + loff_t size, pos; + void *addr; + + max_blocks = 1; + + /* + * No buffer_head is allocated. + * (1) bmap: It's enough to set blocknr without I/O. + * (2) read: The unwritten part should be filled with zero. + * If a folio does not have any buffers, + * let's return -EAGAIN to fall back to + * block_read_full_folio() for per-bh IO. + */ + if (!folio_buffers(bh_result->b_folio)) { + err = -EAGAIN; goto done; - } else if (iblock < valid_blks) { - /* - * The area has been partially written, - * map the written part. - */ - max_blocks = valid_blks - iblock; + } + + pos = EXFAT_BLK_TO_B(iblock, sb); + size = ei->valid_size - pos; + addr = folio_address(bh_result->b_folio) + + offset_in_folio(bh_result->b_folio, pos); + + /* Check if bh->b_data points to proper addr in folio */ + if (bh_result->b_data != addr) { + exfat_fs_error_ratelimit(sb, + "b_data(%p) != folio_addr(%p)", + bh_result->b_data, addr); + err = -EINVAL; goto done; - } else if (iblock == valid_blks && - (ei->valid_size & (sb->s_blocksize - 1))) { - /* - * The block has been partially written, - * zero the unwritten part and map the block. - */ - loff_t size, off, pos; - - max_blocks = 1; - - /* - * For direct read, the unwritten part will be zeroed in - * exfat_direct_IO() - */ - if (!bh_result->b_folio) - goto done; - - pos = EXFAT_BLK_TO_B(iblock, sb); - size = ei->valid_size - pos; - off = pos & (PAGE_SIZE - 1); - - folio_set_bh(bh_result, bh_result->b_folio, off); - err = bh_read(bh_result, 0); - if (err < 0) - goto unlock_ret; - - folio_zero_segment(bh_result->b_folio, off + size, - off + sb->s_blocksize); - } else { - /* - * The range has not been written, clear the mapped flag - * to only zero the cache and do not read from disk. 
- */ - clear_buffer_mapped(bh_result); } + + /* Read a block */ + err = bh_read(bh_result, 0); + if (err < 0) + goto done; + + /* Zero unwritten part of a block */ + memset(bh_result->b_data + size, 0, bh_result->b_size - size); + err = 0; + goto done; } + + /* + * The area has not been written, clear mapped for read/bmap cases. + * If so, it will be filled with zero without reading from disk. + */ + clear_buffer_mapped(bh_result); done: bh_result->b_size = EXFAT_BLK_TO_B(max_blocks, sb); + if (err < 0) + clear_buffer_mapped(bh_result); unlock_ret: mutex_unlock(&sbi->s_lock); return err; diff --git a/fs/exfat/super.c b/fs/exfat/super.c index bd57844414aa..8465033a6cf0 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -67,15 +67,6 @@ static int exfat_statfs(struct dentry *dentry, struct kstatfs *buf) struct exfat_sb_info *sbi = EXFAT_SB(sb); unsigned long long id = huge_encode_dev(sb->s_bdev->bd_dev); - if (sbi->used_clusters == EXFAT_CLUSTERS_UNTRACKED) { - mutex_lock(&sbi->s_lock); - if (exfat_count_used_clusters(sb, &sbi->used_clusters)) { - mutex_unlock(&sbi->s_lock); - return -EIO; - } - mutex_unlock(&sbi->s_lock); - } - buf->f_type = sb->s_magic; buf->f_bsize = sbi->cluster_size; buf->f_blocks = sbi->num_clusters - 2; /* clu 0 & 1 */ @@ -531,7 +522,6 @@ static int exfat_read_boot_sector(struct super_block *sb) sbi->vol_flags = le16_to_cpu(p_boot->vol_flags); sbi->vol_flags_persistent = sbi->vol_flags & (VOLUME_DIRTY | MEDIA_FAILURE); sbi->clu_srch_ptr = EXFAT_FIRST_CLUSTER; - sbi->used_clusters = EXFAT_CLUSTERS_UNTRACKED; /* check consistencies */ if ((u64)sbi->num_FAT_sectors << p_boot->sect_size_bits < diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index f38bdd46e4f7..4025f875252a 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -368,6 +368,7 @@ struct ext2_inode { #define EXT2_MOUNT_ERRORS_CONT 0x000010 /* Continue on errors */ #define EXT2_MOUNT_ERRORS_RO 0x000020 /* Remount fs ro on errors */ #define EXT2_MOUNT_ERRORS_PANIC 0x000040 /* Panic on errors */ +#define EXT2_MOUNT_ERRORS_MASK 0x000070 #define EXT2_MOUNT_MINIX_DF 0x000080 /* Mimics the Minix statfs */ #define EXT2_MOUNT_NOBH 0x000100 /* No buffer_heads */ #define EXT2_MOUNT_NO_UID32 0x000200 /* Disable 32-bit UIDs */ diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 37f7ce56adce..28ff47ec4be6 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -23,7 +23,8 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/blkdev.h> -#include <linux/parser.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/random.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> @@ -40,7 +41,6 @@ #include "acl.h" static void ext2_write_super(struct super_block *sb); -static int ext2_remount (struct super_block * sb, int * flags, char * data); static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); static int ext2_sync_fs(struct super_block *sb, int wait); static int ext2_freeze(struct super_block *sb); @@ -81,6 +81,33 @@ void ext2_error(struct super_block *sb, const char *function, } } +static void ext2_msg_fc(struct fs_context *fc, const char *prefix, + const char *fmt, ...) 
+{ + struct va_format vaf; + va_list args; + const char *s_id; + + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + s_id = fc->root->d_sb->s_id; + } else { + /* get last path component of source */ + s_id = strrchr(fc->source, '/'); + if (s_id) + s_id++; + else + s_id = fc->source; + } + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk("%sEXT2-fs (%s): %pV\n", prefix, s_id, &vaf); + + va_end(args); +} + void ext2_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) { @@ -346,7 +373,6 @@ static const struct super_operations ext2_sops = { .freeze_fs = ext2_freeze, .unfreeze_fs = ext2_unfreeze, .statfs = ext2_statfs, - .remount_fs = ext2_remount, .show_options = ext2_show_options, #ifdef CONFIG_QUOTA .quota_read = ext2_quota_read, @@ -402,230 +428,217 @@ static const struct export_operations ext2_export_ops = { .get_parent = ext2_get_parent, }; -static unsigned long get_sb_block(void **data) -{ - unsigned long sb_block; - char *options = (char *) *data; - - if (!options || strncmp(options, "sb=", 3) != 0) - return 1; /* Default location */ - options += 3; - sb_block = simple_strtoul(options, &options, 0); - if (*options && *options != ',') { - printk("EXT2-fs: Invalid sb specification: %s\n", - (char *) *data); - return 1; - } - if (*options == ',') - options++; - *data = (void *) options; - return sb_block; -} - enum { - Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, - Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, - Opt_err_ro, Opt_nouid32, Opt_debug, - Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr, - Opt_acl, Opt_noacl, Opt_xip, Opt_dax, Opt_ignore, Opt_err, Opt_quota, - Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation + Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid, + Opt_sb, Opt_errors, Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, + Opt_nobh, Opt_user_xattr, Opt_acl, Opt_xip, Opt_dax, Opt_ignore, + Opt_quota, Opt_usrquota, Opt_grpquota, Opt_reservation, }; -static const match_table_t tokens = { - {Opt_bsd_df, "bsddf"}, - {Opt_minix_df, "minixdf"}, - {Opt_grpid, "grpid"}, - {Opt_grpid, "bsdgroups"}, - {Opt_nogrpid, "nogrpid"}, - {Opt_nogrpid, "sysvgroups"}, - {Opt_resgid, "resgid=%u"}, - {Opt_resuid, "resuid=%u"}, - {Opt_sb, "sb=%u"}, - {Opt_err_cont, "errors=continue"}, - {Opt_err_panic, "errors=panic"}, - {Opt_err_ro, "errors=remount-ro"}, - {Opt_nouid32, "nouid32"}, - {Opt_debug, "debug"}, - {Opt_oldalloc, "oldalloc"}, - {Opt_orlov, "orlov"}, - {Opt_nobh, "nobh"}, - {Opt_user_xattr, "user_xattr"}, - {Opt_nouser_xattr, "nouser_xattr"}, - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_xip, "xip"}, - {Opt_dax, "dax"}, - {Opt_grpquota, "grpquota"}, - {Opt_ignore, "noquota"}, - {Opt_quota, "quota"}, - {Opt_usrquota, "usrquota"}, - {Opt_reservation, "reservation"}, - {Opt_noreservation, "noreservation"}, - {Opt_err, NULL} +static const struct constant_table ext2_param_errors[] = { + {"continue", EXT2_MOUNT_ERRORS_CONT}, + {"panic", EXT2_MOUNT_ERRORS_PANIC}, + {"remount-ro", EXT2_MOUNT_ERRORS_RO}, + {} +}; + +static const struct fs_parameter_spec ext2_param_spec[] = { + fsparam_flag ("bsddf", Opt_bsd_df), + fsparam_flag ("minixdf", Opt_minix_df), + fsparam_flag ("grpid", Opt_grpid), + fsparam_flag ("bsdgroups", Opt_grpid), + fsparam_flag ("nogrpid", Opt_nogrpid), + fsparam_flag ("sysvgroups", Opt_nogrpid), + fsparam_gid ("resgid", Opt_resgid), + fsparam_uid ("resuid", Opt_resuid), + fsparam_u32 ("sb", Opt_sb), + fsparam_enum ("errors", Opt_errors, ext2_param_errors), + 
fsparam_flag ("nouid32", Opt_nouid32), + fsparam_flag ("debug", Opt_debug), + fsparam_flag ("oldalloc", Opt_oldalloc), + fsparam_flag ("orlov", Opt_orlov), + fsparam_flag ("nobh", Opt_nobh), + fsparam_flag_no ("user_xattr", Opt_user_xattr), + fsparam_flag_no ("acl", Opt_acl), + fsparam_flag ("xip", Opt_xip), + fsparam_flag ("dax", Opt_dax), + fsparam_flag ("grpquota", Opt_grpquota), + fsparam_flag ("noquota", Opt_ignore), + fsparam_flag ("quota", Opt_quota), + fsparam_flag ("usrquota", Opt_usrquota), + fsparam_flag_no ("reservation", Opt_reservation), + {} +}; + +#define EXT2_SPEC_s_resuid (1 << 0) +#define EXT2_SPEC_s_resgid (1 << 1) + +struct ext2_fs_context { + unsigned long vals_s_flags; /* Bits to set in s_flags */ + unsigned long mask_s_flags; /* Bits changed in s_flags */ + unsigned int vals_s_mount_opt; + unsigned int mask_s_mount_opt; + kuid_t s_resuid; + kgid_t s_resgid; + unsigned long s_sb_block; + unsigned int spec; + }; -static int parse_options(char *options, struct super_block *sb, - struct ext2_mount_options *opts) +static inline void ctx_set_mount_opt(struct ext2_fs_context *ctx, + unsigned long flag) { - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - kuid_t uid; - kgid_t gid; - - if (!options) - return 1; - - while ((p = strsep (&options, ",")) != NULL) { - int token; - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_bsd_df: - clear_opt (opts->s_mount_opt, MINIX_DF); - break; - case Opt_minix_df: - set_opt (opts->s_mount_opt, MINIX_DF); - break; - case Opt_grpid: - set_opt (opts->s_mount_opt, GRPID); - break; - case Opt_nogrpid: - clear_opt (opts->s_mount_opt, GRPID); - break; - case Opt_resuid: - if (match_int(&args[0], &option)) - return 0; - uid = make_kuid(current_user_ns(), option); - if (!uid_valid(uid)) { - ext2_msg(sb, KERN_ERR, "Invalid uid value %d", option); - return 0; - - } - opts->s_resuid = uid; - break; - case Opt_resgid: - if (match_int(&args[0], &option)) - return 0; - gid = make_kgid(current_user_ns(), option); - if (!gid_valid(gid)) { - ext2_msg(sb, KERN_ERR, "Invalid gid value %d", option); - return 0; - } - opts->s_resgid = gid; - break; - case Opt_sb: - /* handled by get_sb_block() instead of here */ - /* *sb_block = match_int(&args[0]); */ - break; - case Opt_err_panic: - clear_opt (opts->s_mount_opt, ERRORS_CONT); - clear_opt (opts->s_mount_opt, ERRORS_RO); - set_opt (opts->s_mount_opt, ERRORS_PANIC); - break; - case Opt_err_ro: - clear_opt (opts->s_mount_opt, ERRORS_CONT); - clear_opt (opts->s_mount_opt, ERRORS_PANIC); - set_opt (opts->s_mount_opt, ERRORS_RO); - break; - case Opt_err_cont: - clear_opt (opts->s_mount_opt, ERRORS_RO); - clear_opt (opts->s_mount_opt, ERRORS_PANIC); - set_opt (opts->s_mount_opt, ERRORS_CONT); - break; - case Opt_nouid32: - set_opt (opts->s_mount_opt, NO_UID32); - break; - case Opt_debug: - set_opt (opts->s_mount_opt, DEBUG); - break; - case Opt_oldalloc: - set_opt (opts->s_mount_opt, OLDALLOC); - break; - case Opt_orlov: - clear_opt (opts->s_mount_opt, OLDALLOC); - break; - case Opt_nobh: - ext2_msg(sb, KERN_INFO, - "nobh option not supported"); - break; + ctx->mask_s_mount_opt |= flag; + ctx->vals_s_mount_opt |= flag; +} + +static inline void ctx_clear_mount_opt(struct ext2_fs_context *ctx, + unsigned long flag) +{ + ctx->mask_s_mount_opt |= flag; + ctx->vals_s_mount_opt &= ~flag; +} + +static inline unsigned long +ctx_test_mount_opt(struct ext2_fs_context *ctx, unsigned long flag) +{ + return (ctx->vals_s_mount_opt & flag); +} + +static inline bool 
+ctx_parsed_mount_opt(struct ext2_fs_context *ctx, unsigned long flag) +{ + return (ctx->mask_s_mount_opt & flag); +} + +static void ext2_free_fc(struct fs_context *fc) +{ + kfree(fc->fs_private); +} + +static int ext2_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct ext2_fs_context *ctx = fc->fs_private; + int opt; + struct fs_parse_result result; + + opt = fs_parse(fc, ext2_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_bsd_df: + ctx_clear_mount_opt(ctx, EXT2_MOUNT_MINIX_DF); + break; + case Opt_minix_df: + ctx_set_mount_opt(ctx, EXT2_MOUNT_MINIX_DF); + break; + case Opt_grpid: + ctx_set_mount_opt(ctx, EXT2_MOUNT_GRPID); + break; + case Opt_nogrpid: + ctx_clear_mount_opt(ctx, EXT2_MOUNT_GRPID); + break; + case Opt_resuid: + ctx->s_resuid = result.uid; + ctx->spec |= EXT2_SPEC_s_resuid; + break; + case Opt_resgid: + ctx->s_resgid = result.gid; + ctx->spec |= EXT2_SPEC_s_resgid; + break; + case Opt_sb: + /* Note that this is silently ignored on remount */ + ctx->s_sb_block = result.uint_32; + break; + case Opt_errors: + ctx_clear_mount_opt(ctx, EXT2_MOUNT_ERRORS_MASK); + ctx_set_mount_opt(ctx, result.uint_32); + break; + case Opt_nouid32: + ctx_set_mount_opt(ctx, EXT2_MOUNT_NO_UID32); + break; + case Opt_debug: + ctx_set_mount_opt(ctx, EXT2_MOUNT_DEBUG); + break; + case Opt_oldalloc: + ctx_set_mount_opt(ctx, EXT2_MOUNT_OLDALLOC); + break; + case Opt_orlov: + ctx_clear_mount_opt(ctx, EXT2_MOUNT_OLDALLOC); + break; + case Opt_nobh: + ext2_msg_fc(fc, KERN_INFO, "nobh option not supported\n"); + break; #ifdef CONFIG_EXT2_FS_XATTR - case Opt_user_xattr: - set_opt (opts->s_mount_opt, XATTR_USER); - break; - case Opt_nouser_xattr: - clear_opt (opts->s_mount_opt, XATTR_USER); - break; + case Opt_user_xattr: + if (!result.negated) + ctx_set_mount_opt(ctx, EXT2_MOUNT_XATTR_USER); + else + ctx_clear_mount_opt(ctx, EXT2_MOUNT_XATTR_USER); + break; #else - case Opt_user_xattr: - case Opt_nouser_xattr: - ext2_msg(sb, KERN_INFO, "(no)user_xattr options" - "not supported"); - break; + case Opt_user_xattr: + ext2_msg_fc(fc, KERN_INFO, "(no)user_xattr options not supported"); + break; #endif #ifdef CONFIG_EXT2_FS_POSIX_ACL - case Opt_acl: - set_opt(opts->s_mount_opt, POSIX_ACL); - break; - case Opt_noacl: - clear_opt(opts->s_mount_opt, POSIX_ACL); - break; + case Opt_acl: + if (!result.negated) + ctx_set_mount_opt(ctx, EXT2_MOUNT_POSIX_ACL); + else + ctx_clear_mount_opt(ctx, EXT2_MOUNT_POSIX_ACL); + break; #else - case Opt_acl: - case Opt_noacl: - ext2_msg(sb, KERN_INFO, - "(no)acl options not supported"); - break; + case Opt_acl: + ext2_msg_fc(fc, KERN_INFO, "(no)acl options not supported"); + break; #endif - case Opt_xip: - ext2_msg(sb, KERN_INFO, "use dax instead of xip"); - set_opt(opts->s_mount_opt, XIP); - fallthrough; - case Opt_dax: + case Opt_xip: + ext2_msg_fc(fc, KERN_INFO, "use dax instead of xip"); + ctx_set_mount_opt(ctx, EXT2_MOUNT_XIP); + fallthrough; + case Opt_dax: #ifdef CONFIG_FS_DAX - ext2_msg(sb, KERN_WARNING, - "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - set_opt(opts->s_mount_opt, DAX); + ext2_msg_fc(fc, KERN_WARNING, + "DAX enabled. 
Warning: EXPERIMENTAL, use at your own risk"); + ctx_set_mount_opt(ctx, EXT2_MOUNT_DAX); #else - ext2_msg(sb, KERN_INFO, "dax option not supported"); + ext2_msg_fc(fc, KERN_INFO, "dax option not supported"); #endif - break; + break; #if defined(CONFIG_QUOTA) - case Opt_quota: - case Opt_usrquota: - set_opt(opts->s_mount_opt, USRQUOTA); - break; - - case Opt_grpquota: - set_opt(opts->s_mount_opt, GRPQUOTA); - break; + case Opt_quota: + case Opt_usrquota: + ctx_set_mount_opt(ctx, EXT2_MOUNT_USRQUOTA); + break; + + case Opt_grpquota: + ctx_set_mount_opt(ctx, EXT2_MOUNT_GRPQUOTA); + break; #else - case Opt_quota: - case Opt_usrquota: - case Opt_grpquota: - ext2_msg(sb, KERN_INFO, - "quota operations not supported"); - break; + case Opt_quota: + case Opt_usrquota: + case Opt_grpquota: + ext2_msg_fc(fc, KERN_INFO, "quota operations not supported"); + break; #endif - - case Opt_reservation: - set_opt(opts->s_mount_opt, RESERVATION); - ext2_msg(sb, KERN_INFO, "reservations ON"); - break; - case Opt_noreservation: - clear_opt(opts->s_mount_opt, RESERVATION); - ext2_msg(sb, KERN_INFO, "reservations OFF"); - break; - case Opt_ignore: - break; - default: - return 0; + case Opt_reservation: + if (!result.negated) { + ctx_set_mount_opt(ctx, EXT2_MOUNT_RESERVATION); + ext2_msg_fc(fc, KERN_INFO, "reservations ON"); + } else { + ctx_clear_mount_opt(ctx, EXT2_MOUNT_RESERVATION); + ext2_msg_fc(fc, KERN_INFO, "reservations OFF"); } + break; + case Opt_ignore: + break; + default: + return -EINVAL; } - return 1; + return 0; } static int ext2_setup_super (struct super_block * sb, @@ -801,24 +814,83 @@ static unsigned long descriptor_loc(struct super_block *sb, return ext2_group_first_block_no(sb, bg) + ext2_bg_has_super(sb, bg); } -static int ext2_fill_super(struct super_block *sb, void *data, int silent) +/* + * Set all mount options either from defaults on disk, or from parsed + * options. Parsed/specified options override on-disk defaults. 
+ */ +static void ext2_set_options(struct fs_context *fc, struct ext2_sb_info *sbi) +{ + struct ext2_fs_context *ctx = fc->fs_private; + struct ext2_super_block *es = sbi->s_es; + unsigned long def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + + /* Copy parsed mount options to sbi */ + sbi->s_mount_opt = ctx->vals_s_mount_opt; + + /* Use in-superblock defaults only if not specified during parsing */ + if (!ctx_parsed_mount_opt(ctx, EXT2_MOUNT_DEBUG) && + def_mount_opts & EXT2_DEFM_DEBUG) + set_opt(sbi->s_mount_opt, DEBUG); + + if (!ctx_parsed_mount_opt(ctx, EXT2_MOUNT_GRPID) && + def_mount_opts & EXT2_DEFM_BSDGROUPS) + set_opt(sbi->s_mount_opt, GRPID); + + if (!ctx_parsed_mount_opt(ctx, EXT2_MOUNT_NO_UID32) && + def_mount_opts & EXT2_DEFM_UID16) + set_opt(sbi->s_mount_opt, NO_UID32); + +#ifdef CONFIG_EXT2_FS_XATTR + if (!ctx_parsed_mount_opt(ctx, EXT2_MOUNT_XATTR_USER) && + def_mount_opts & EXT2_DEFM_XATTR_USER) + set_opt(sbi->s_mount_opt, XATTR_USER); +#endif +#ifdef CONFIG_EXT2_FS_POSIX_ACL + if (!ctx_parsed_mount_opt(ctx, EXT2_MOUNT_POSIX_ACL) && + def_mount_opts & EXT2_DEFM_ACL) + set_opt(sbi->s_mount_opt, POSIX_ACL); +#endif + + if (!ctx_parsed_mount_opt(ctx, EXT2_MOUNT_ERRORS_MASK)) { + if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_PANIC) + set_opt(sbi->s_mount_opt, ERRORS_PANIC); + else if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_CONTINUE) + set_opt(sbi->s_mount_opt, ERRORS_CONT); + else + set_opt(sbi->s_mount_opt, ERRORS_RO); + } + + if (ctx->spec & EXT2_SPEC_s_resuid) + sbi->s_resuid = ctx->s_resuid; + else + sbi->s_resuid = make_kuid(&init_user_ns, + le16_to_cpu(es->s_def_resuid)); + + if (ctx->spec & EXT2_SPEC_s_resgid) + sbi->s_resgid = ctx->s_resgid; + else + sbi->s_resgid = make_kgid(&init_user_ns, + le16_to_cpu(es->s_def_resgid)); +} + +static int ext2_fill_super(struct super_block *sb, struct fs_context *fc) { + struct ext2_fs_context *ctx = fc->fs_private; + int silent = fc->sb_flags & SB_SILENT; struct buffer_head * bh; struct ext2_sb_info * sbi; struct ext2_super_block * es; struct inode *root; unsigned long block; - unsigned long sb_block = get_sb_block(&data); + unsigned long sb_block = ctx->s_sb_block; unsigned long logic_sb_block; unsigned long offset = 0; - unsigned long def_mount_opts; long ret = -ENOMEM; int blocksize = BLOCK_SIZE; int db_count; int i, j; __le32 features; int err; - struct ext2_mount_options opts; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) @@ -877,42 +949,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_magic != EXT2_SUPER_MAGIC) goto cantfind_ext2; - opts.s_mount_opt = 0; - /* Set defaults before we parse the mount options */ - def_mount_opts = le32_to_cpu(es->s_default_mount_opts); - if (def_mount_opts & EXT2_DEFM_DEBUG) - set_opt(opts.s_mount_opt, DEBUG); - if (def_mount_opts & EXT2_DEFM_BSDGROUPS) - set_opt(opts.s_mount_opt, GRPID); - if (def_mount_opts & EXT2_DEFM_UID16) - set_opt(opts.s_mount_opt, NO_UID32); -#ifdef CONFIG_EXT2_FS_XATTR - if (def_mount_opts & EXT2_DEFM_XATTR_USER) - set_opt(opts.s_mount_opt, XATTR_USER); -#endif -#ifdef CONFIG_EXT2_FS_POSIX_ACL - if (def_mount_opts & EXT2_DEFM_ACL) - set_opt(opts.s_mount_opt, POSIX_ACL); -#endif - - if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_PANIC) - set_opt(opts.s_mount_opt, ERRORS_PANIC); - else if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_CONTINUE) - set_opt(opts.s_mount_opt, ERRORS_CONT); - else - set_opt(opts.s_mount_opt, ERRORS_RO); - - opts.s_resuid = make_kuid(&init_user_ns, 
le16_to_cpu(es->s_def_resuid)); - opts.s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); - - set_opt(opts.s_mount_opt, RESERVATION); - - if (!parse_options((char *) data, sb, &opts)) - goto failed_mount; - - sbi->s_mount_opt = opts.s_mount_opt; - sbi->s_resuid = opts.s_resuid; - sbi->s_resgid = opts.s_resgid; + ext2_set_options(fc, sbi); sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); @@ -1324,23 +1361,21 @@ static void ext2_write_super(struct super_block *sb) ext2_sync_fs(sb, 1); } -static int ext2_remount (struct super_block * sb, int * flags, char * data) +static int ext2_reconfigure(struct fs_context *fc) { + struct ext2_fs_context *ctx = fc->fs_private; + struct super_block *sb = fc->root->d_sb; struct ext2_sb_info * sbi = EXT2_SB(sb); struct ext2_super_block * es; struct ext2_mount_options new_opts; + int flags = fc->sb_flags; int err; sync_filesystem(sb); - spin_lock(&sbi->s_lock); - new_opts.s_mount_opt = sbi->s_mount_opt; - new_opts.s_resuid = sbi->s_resuid; - new_opts.s_resgid = sbi->s_resgid; - spin_unlock(&sbi->s_lock); - - if (!parse_options(data, sb, &new_opts)) - return -EINVAL; + new_opts.s_mount_opt = ctx->vals_s_mount_opt; + new_opts.s_resuid = ctx->s_resuid; + new_opts.s_resgid = ctx->s_resgid; spin_lock(&sbi->s_lock); es = sbi->s_es; @@ -1349,9 +1384,9 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) "dax flag with busy inodes while remounting"); new_opts.s_mount_opt ^= EXT2_MOUNT_DAX; } - if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) + if ((bool)(flags & SB_RDONLY) == sb_rdonly(sb)) goto out_set; - if (*flags & SB_RDONLY) { + if (flags & SB_RDONLY) { if (le16_to_cpu(es->s_state) & EXT2_VALID_FS || !(sbi->s_mount_state & EXT2_VALID_FS)) goto out_set; @@ -1470,10 +1505,9 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf) return 0; } -static struct dentry *ext2_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int ext2_get_tree(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, ext2_fill_super); + return get_tree_bdev(fc, ext2_fill_super); } #ifdef CONFIG_QUOTA @@ -1556,7 +1590,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type, } lock_buffer(bh); memcpy(bh->b_data+offset, data, tocopy); - flush_dcache_page(bh->b_page); + flush_dcache_folio(bh->b_folio); set_buffer_uptodate(bh); mark_buffer_dirty(bh); unlock_buffer(bh); @@ -1624,12 +1658,49 @@ out: #endif +static const struct fs_context_operations ext2_context_ops = { + .parse_param = ext2_parse_param, + .get_tree = ext2_get_tree, + .reconfigure = ext2_reconfigure, + .free = ext2_free_fc, +}; + +static int ext2_init_fs_context(struct fs_context *fc) +{ + struct ext2_fs_context *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + struct super_block *sb = fc->root->d_sb; + struct ext2_sb_info *sbi = EXT2_SB(sb); + + spin_lock(&sbi->s_lock); + ctx->vals_s_mount_opt = sbi->s_mount_opt; + ctx->vals_s_flags = sb->s_flags; + ctx->s_resuid = sbi->s_resuid; + ctx->s_resgid = sbi->s_resgid; + spin_unlock(&sbi->s_lock); + } else { + ctx->s_sb_block = 1; + ctx_set_mount_opt(ctx, EXT2_MOUNT_RESERVATION); + } + + fc->fs_private = ctx; + fc->ops = &ext2_context_ops; + + return 0; +} + static struct file_system_type ext2_fs_type = { .owner = THIS_MODULE, .name = "ext2", - .mount = ext2_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, + 
.init_fs_context = ext2_init_fs_context, + .parameters = ext2_param_spec, }; MODULE_ALIAS_FS("ext2"); diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 8042ad873808..c48fd36b2d74 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -649,8 +649,8 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi, /* Hm, nope. Are (enough) root reserved clusters available? */ if (uid_eq(sbi->s_resuid, current_fsuid()) || (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || - capable(CAP_SYS_RESOURCE) || - (flags & EXT4_MB_USE_ROOT_BLOCKS)) { + (flags & EXT4_MB_USE_ROOT_BLOCKS) || + capable(CAP_SYS_RESOURCE)) { if (free_clusters >= (nclusters + dirty_clusters + resv_clusters)) diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index 2a135075468d..a4dbaccee6e7 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -25,7 +25,7 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); int sz; - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return 1; sz = EXT4_INODES_PER_GROUP(sb) >> 3; @@ -48,7 +48,7 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); int sz; - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return; sz = EXT4_INODES_PER_GROUP(sb) >> 3; @@ -67,7 +67,7 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return 1; provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo); @@ -89,7 +89,7 @@ void ext4_block_bitmap_csum_set(struct super_block *sb, __u32 csum; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 02d47a64e8d1..d4164c507a90 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -86,7 +86,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, dir->i_sb->s_blocksize); const int next_offset = ((char *) de - buf) + rlen; bool fake = is_fake_dir_entry(de); - bool has_csum = ext4_has_metadata_csum(dir->i_sb); + bool has_csum = ext4_has_feature_metadata_csum(dir->i_sb); if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir))) error_msg = "rec_len is smaller than minimal"; @@ -104,6 +104,9 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, else if (unlikely(le32_to_cpu(de->inode) > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; + else if (unlikely(next_offset == size && de->name_len == 1 && + de->name[0] == '.')) + error_msg = "'.' directory cannot be the last in data block"; else return 0; @@ -145,7 +148,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) return err; /* Can we just clear INDEX flag to ignore htree information? */ - if (!ext4_has_metadata_csum(sb)) { + if (!ext4_has_feature_metadata_csum(sb)) { /* * We don't set the inode dirty flag since it's not * critical that it gets flushed back to the disk. 
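A note on the fs_parse() pattern the ext2 mount-API conversion above adopts: fs_parse() matches a single mount parameter against the fs_parameter_spec table and fills struct fs_parse_result, so each case only has to inspect the result (result.negated for fsparam_flag_no() options, result.uint_32 for integers and enums, result.uid/result.gid for the uid/gid helpers). The sketch below shows the same flow with hypothetical "foo" options; it is a minimal illustration, not part of the ext2 patch.

/* Hypothetical filesystem context demonstrating the fs_parse() flow
 * used by ext2_parse_param() above. */
#include <linux/fs_context.h>
#include <linux/fs_parser.h>

enum { Opt_foo_dots, Opt_foo_mode };

static const struct fs_parameter_spec foo_param_spec[] = {
	fsparam_flag_no("dots", Opt_foo_dots),	/* accepts "dots" and "nodots" */
	fsparam_u32oct("mode", Opt_foo_mode),	/* e.g. mode=0755 */
	{}
};

struct foo_fs_context {
	bool nodots;
	u32 mode;
};

static int foo_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
	struct foo_fs_context *ctx = fc->fs_private;
	struct fs_parse_result result;
	int opt = fs_parse(fc, foo_param_spec, param, &result);

	if (opt < 0)
		return opt;

	switch (opt) {
	case Opt_foo_dots:
		/* result.negated is true for the "no" form of the flag */
		ctx->nodots = result.negated;
		break;
	case Opt_foo_mode:
		ctx->mode = result.uint_32;
		break;
	}
	return 0;
}

The ext2-specific refinement is the mask_s_mount_opt/EXT2_SPEC_* bookkeeping: ext2_set_options() can then apply on-disk defaults only to options the user never mentioned, a distinction the old parse_options() could not make.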
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4e7de7eaa374..5a20e9cd7184 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -278,7 +278,10 @@ struct ext4_system_blocks { /* * Flags for ext4_io_end->flags */ -#define EXT4_IO_END_UNWRITTEN 0x0001 +#define EXT4_IO_END_UNWRITTEN 0x0001 +#define EXT4_IO_END_FAILED 0x0002 + +#define EXT4_IO_END_DEFER_COMPLETION (EXT4_IO_END_UNWRITTEN | EXT4_IO_END_FAILED) struct ext4_io_end_vec { struct list_head list; /* list of io_end_vec */ @@ -367,6 +370,8 @@ struct ext4_io_submit { #define EXT4_MAX_BLOCKS(size, offset, blkbits) \ ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \ blkbits)) +#define EXT4_B_TO_LBLK(inode, offset) \ + (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits) /* Translate a block number to a cluster number */ #define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) @@ -1058,7 +1063,8 @@ struct ext4_inode_info { /* Number of ongoing updates on this inode */ atomic_t i_fc_updates; - atomic_t i_unwritten; /* Nr. of inflight conversions pending */ + + spinlock_t i_raw_lock; /* protects updates to the raw inode */ /* Fast commit wait queue for this inode */ wait_queue_head_t i_fc_wait; @@ -1097,8 +1103,6 @@ struct ext4_inode_info { struct inode vfs_inode; struct jbd2_inode *jinode; - spinlock_t i_raw_lock; /* protects updates to the raw inode */ - /* * File creation time. Its function is same as that of * struct timespec64 i_{a,c,m}time in the generic inode. @@ -1141,6 +1145,7 @@ struct ext4_inode_info { /* quota space reservation, managed internally by quota code */ qsize_t i_reserved_quota; #endif + spinlock_t i_block_reservation_lock; /* Lock protecting lists below */ spinlock_t i_completed_io_lock; @@ -1151,8 +1156,6 @@ struct ext4_inode_info { struct list_head i_rsv_conversion_list; struct work_struct i_rsv_conversion_work; - spinlock_t i_block_reservation_lock; - /* * Transactions that contain inode's metadata needed to complete * fsync and fdatasync, respectively. 
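The new EXT4_B_TO_LBLK(inode, offset) macro above rounds a byte offset up to the next block boundary before converting it to a logical block number; ext4_zero_range() further down in this diff uses it so that only whole blocks strictly inside [offset, end) are converted to unwritten, leaving the ragged edges to ext4_zero_partial_blocks(). A userspace model of the arithmetic, assuming a 4096-byte block size (blkbits = 12); not kernel code:

#include <stdint.h>
#include <stdio.h>

static uint64_t b_to_lblk(uint64_t offset, unsigned int blkbits)
{
	uint64_t bs = 1ULL << blkbits;

	/* round_up(offset, bs) >> blkbits */
	return ((offset + bs - 1) & ~(bs - 1)) >> blkbits;
}

int main(void)
{
	/* 4096 -> block 1; 4097 rounds up into block 2; 0 -> block 0 */
	printf("%llu %llu %llu\n",
	       (unsigned long long)b_to_lblk(4096, 12),
	       (unsigned long long)b_to_lblk(4097, 12),
	       (unsigned long long)b_to_lblk(0, 12));
	return 0;
}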
@@ -1606,6 +1609,8 @@ struct ext4_sb_info { unsigned int s_mb_prefetch; unsigned int s_mb_prefetch_limit; unsigned int s_mb_best_avail_max_trim_order; + unsigned int s_sb_update_sec; + unsigned int s_sb_update_kb; /* stats for buddy allocator */ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ @@ -1821,7 +1826,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) */ enum { EXT4_MF_MNTDIR_SAMPLED, - EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */ + EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */ + EXT4_MF_JOURNAL_DESTROY /* Journal is in the process of being destroyed */ }; static inline void ext4_set_mount_flag(struct super_block *sb, int bit) @@ -2232,15 +2238,32 @@ extern int ext4_feature_set_ok(struct super_block *sb, int readonly); /* * Superblock flags */ -#define EXT4_FLAGS_RESIZING 0 -#define EXT4_FLAGS_SHUTDOWN 1 -#define EXT4_FLAGS_BDEV_IS_DAX 2 +enum { + EXT4_FLAGS_RESIZING, /* Avoid superblock update and resize race */ + EXT4_FLAGS_SHUTDOWN, /* Prevent access to the file system */ + EXT4_FLAGS_BDEV_IS_DAX, /* Current block device supports DAX */ + EXT4_FLAGS_EMERGENCY_RO,/* Emergency read-only due to fs errors */ +}; static inline int ext4_forced_shutdown(struct super_block *sb) { return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); } +static inline int ext4_emergency_ro(struct super_block *sb) +{ + return test_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags); +} + +static inline int ext4_emergency_state(struct super_block *sb) +{ + if (unlikely(ext4_forced_shutdown(sb))) + return -EIO; + if (unlikely(ext4_emergency_ro(sb))) + return -EROFS; + return 0; +} + /* * Default values for user and/or group using reserved blocks */ @@ -2278,6 +2301,13 @@ static inline int ext4_forced_shutdown(struct super_block *sb) #define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ /* + * Default values for superblock update + */ +#define EXT4_DEF_SB_UPDATE_INTERVAL_SEC (3600) /* seconds (1 hour) */ +#define EXT4_DEF_SB_UPDATE_INTERVAL_KB (16384) /* kilobytes (16MB) */ + + +/* * Minimum number of groups in a flexgroup before we separate out * directories into the first block group of a flexgroup */ @@ -2810,8 +2840,7 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, struct ext4_dir_entry_2 *dirent, struct fscrypt_str *ent_name); extern void ext4_htree_free_dir_info(struct dir_private_info *p); -extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, - struct buffer_head *bh, +extern int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size, struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de); @@ -3001,6 +3030,8 @@ extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); +extern int ext4_truncate_page_cache_block_range(struct inode *inode, + loff_t start, loff_t end); extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); @@ -3259,14 +3290,10 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, extern int ext4_register_li_request(struct super_block *sb, ext4_group_t first_not_zeroed); -static inline int ext4_has_metadata_csum(struct super_block *sb) -{ - return ext4_has_feature_metadata_csum(sb); -} - static inline int ext4_has_group_desc_csum(struct super_block *sb) { - return 
ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb); + return ext4_has_feature_gdt_csum(sb) || + ext4_has_feature_metadata_csum(sb); } #define ext4_read_incompat_64bit_val(es, name) \ @@ -3546,11 +3573,11 @@ extern int ext4_try_to_write_inline_data(struct address_space *mapping, struct folio **foliop); int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct folio *folio); -extern int ext4_da_write_inline_data_begin(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - struct folio **foliop, - void **fsdata); +extern int ext4_generic_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + struct folio **foliop, + void **fsdata, bool da); extern int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode); @@ -3785,34 +3812,19 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); } -/* For ioend & aio unwritten conversion wait queues */ -#define EXT4_WQ_HASH_SZ 37 -#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ - EXT4_WQ_HASH_SZ]) -extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; - extern int ext4_resize_begin(struct super_block *sb); extern int ext4_resize_end(struct super_block *sb, bool update_backups); -static inline void ext4_set_io_unwritten_flag(struct inode *inode, - struct ext4_io_end *io_end) +static inline void ext4_set_io_unwritten_flag(struct ext4_io_end *io_end) { - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) io_end->flag |= EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_unwritten); - } } static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) { - struct inode *inode = io_end->inode; - - if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + if (io_end->flag & EXT4_IO_END_UNWRITTEN) io_end->flag &= ~EXT4_IO_END_UNWRITTEN; - /* Wake up anyone waiting on unwritten extent conversion */ - if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) - wake_up_all(ext4_ioend_wq(inode)); - } } extern const struct iomap_ops ext4_iomap_ops; diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index da4a82456383..135e278c832e 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -63,12 +63,14 @@ static void ext4_put_nojournal(handle_t *handle) */ static int ext4_journal_check_start(struct super_block *sb) { + int ret; journal_t *journal; might_sleep(); - if (unlikely(ext4_forced_shutdown(sb))) - return -EIO; + ret = ext4_emergency_state(sb); + if (unlikely(ret)) + return ret; if (WARN_ON_ONCE(sb_rdonly(sb))) return -EROFS; @@ -244,7 +246,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, } } else ext4_check_bdev_write_error(sb); - if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb)) + if (trigger_type == EXT4_JTR_NONE || + !ext4_has_feature_metadata_csum(sb)) return 0; BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); jbd2_journal_set_triggers(bh, @@ -331,7 +334,8 @@ int __ext4_journal_get_create_access(const char *where, unsigned int line, err); return err; } - if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb)) + if (trigger_type == EXT4_JTR_NONE || + !ext4_has_feature_metadata_csum(sb)) return 0; BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); jbd2_journal_set_triggers(bh, diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 0c77697d5e90..3221714d9901 100644 --- a/fs/ext4/ext4_jbd2.h 
+++ b/fs/ext4/ext4_jbd2.h @@ -122,90 +122,6 @@ #define EXT4_HT_EXT_CONVERT 11 #define EXT4_HT_MAX 12 -/** - * struct ext4_journal_cb_entry - Base structure for callback information. - * - * This struct is a 'seed' structure for a using with your own callback - * structs. If you are using callbacks you must allocate one of these - * or another struct of your own definition which has this struct - * as it's first element and pass it to ext4_journal_callback_add(). - */ -struct ext4_journal_cb_entry { - /* list information for other callbacks attached to the same handle */ - struct list_head jce_list; - - /* Function to call with this callback structure */ - void (*jce_func)(struct super_block *sb, - struct ext4_journal_cb_entry *jce, int error); - - /* user data goes here */ -}; - -/** - * ext4_journal_callback_add: add a function to call after transaction commit - * @handle: active journal transaction handle to register callback on - * @func: callback function to call after the transaction has committed: - * @sb: superblock of current filesystem for transaction - * @jce: returned journal callback data - * @rc: journal state at commit (0 = transaction committed properly) - * @jce: journal callback data (internal and function private data struct) - * - * The registered function will be called in the context of the journal thread - * after the transaction for which the handle was created has completed. - * - * No locks are held when the callback function is called, so it is safe to - * call blocking functions from within the callback, but the callback should - * not block or run for too long, or the filesystem will be blocked waiting for - * the next transaction to commit. No journaling functions can be used, or - * there is a risk of deadlock. - * - * There is no guaranteed calling order of multiple registered callbacks on - * the same transaction. 
- */ -static inline void _ext4_journal_callback_add(handle_t *handle, - struct ext4_journal_cb_entry *jce) -{ - /* Add the jce to transaction's private list */ - list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); -} - -static inline void ext4_journal_callback_add(handle_t *handle, - void (*func)(struct super_block *sb, - struct ext4_journal_cb_entry *jce, - int rc), - struct ext4_journal_cb_entry *jce) -{ - struct ext4_sb_info *sbi = - EXT4_SB(handle->h_transaction->t_journal->j_private); - - /* Add the jce to transaction's private list */ - jce->jce_func = func; - spin_lock(&sbi->s_md_lock); - _ext4_journal_callback_add(handle, jce); - spin_unlock(&sbi->s_md_lock); -} - - -/** - * ext4_journal_callback_del: delete a registered callback - * @handle: active journal transaction handle on which callback was registered - * @jce: registered journal callback entry to unregister - * Return true if object was successfully removed - */ -static inline bool ext4_journal_callback_try_del(handle_t *handle, - struct ext4_journal_cb_entry *jce) -{ - bool deleted; - struct ext4_sb_info *sbi = - EXT4_SB(handle->h_transaction->t_journal->j_private); - - spin_lock(&sbi->s_md_lock); - deleted = !list_empty(&jce->jce_list); - list_del_init(&jce->jce_list); - spin_unlock(&sbi->s_md_lock); - return deleted; -} - int ext4_mark_iloc_dirty(handle_t *handle, struct inode *inode, @@ -513,4 +429,33 @@ static inline int ext4_should_dioread_nolock(struct inode *inode) return 1; } +/* + * Pass journal explicitly as it may not be cached in the sbi->s_journal in some + * cases + */ +static inline int ext4_journal_destroy(struct ext4_sb_info *sbi, journal_t *journal) +{ + int err = 0; + + /* + * At this point only two things can be operating on the journal. + * JBD2 thread performing transaction commit and s_sb_upd_work + * issuing sb update through the journal. Once we set + * EXT4_JOURNAL_DESTROY, new ext4_handle_error() calls will not + * queue s_sb_upd_work and ext4_force_commit() makes sure any + * ext4_handle_error() calls from the running transaction commit are + * finished. Hence no new s_sb_upd_work can be queued after we + * flush it here. 
+ */ + ext4_set_mount_flag(sbi->s_sb, EXT4_MF_JOURNAL_DESTROY); + + ext4_force_commit(sbi->s_sb); + flush_work(&sbi->s_sb_upd_work); + + err = jbd2_journal_destroy(journal); + sbi->s_journal = NULL; + + return err; +} + #endif /* _EXT4_JBD2_H */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a07a98a4b97a..c616a16a9f36 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -63,7 +63,7 @@ static int ext4_extent_block_csum_verify(struct inode *inode, { struct ext4_extent_tail *et; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return 1; et = find_ext4_extent_tail(eh); @@ -77,7 +77,7 @@ static void ext4_extent_block_csum_set(struct inode *inode, { struct ext4_extent_tail *et; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; et = find_ext4_extent_tail(eh); @@ -4568,131 +4568,65 @@ static long ext4_zero_range(struct file *file, loff_t offset, loff_t len, int mode) { struct inode *inode = file_inode(file); - struct address_space *mapping = file->f_mapping; handle_t *handle = NULL; - unsigned int max_blocks; loff_t new_size = 0; - int ret = 0; - int flags; - int credits; - int partial_begin, partial_end; - loff_t start, end; - ext4_lblk_t lblk; + loff_t end = offset + len; + ext4_lblk_t start_lblk, end_lblk; + unsigned int blocksize = i_blocksize(inode); unsigned int blkbits = inode->i_blkbits; + int ret, flags, credits; trace_ext4_zero_range(inode, offset, len, mode); + WARN_ON_ONCE(!inode_is_locked(inode)); - /* - * Round up offset. This is not fallocate, we need to zero out - * blocks, so convert interior block aligned part of the range to - * unwritten and possibly manually zero out unaligned parts of the - * range. Here, start and partial_begin are inclusive, end and - * partial_end are exclusive. 
- */ - start = round_up(offset, 1 << blkbits); - end = round_down((offset + len), 1 << blkbits); - - if (start < offset || end > offset + len) - return -EINVAL; - partial_begin = offset & ((1 << blkbits) - 1); - partial_end = (offset + len) & ((1 << blkbits) - 1); - - lblk = start >> blkbits; - max_blocks = (end >> blkbits); - if (max_blocks < lblk) - max_blocks = 0; - else - max_blocks -= lblk; - - inode_lock(inode); - - /* - * Indirect files do not support unwritten extents - */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { - ret = -EOPNOTSUPP; - goto out_mutex; - } + /* Indirect files do not support unwritten extents */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return -EOPNOTSUPP; if (!(mode & FALLOC_FL_KEEP_SIZE) && - (offset + len > inode->i_size || - offset + len > EXT4_I(inode)->i_disksize)) { - new_size = offset + len; + (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) { + new_size = end; ret = inode_newsize_ok(inode, new_size); if (ret) - goto out_mutex; + return ret; } flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; - - /* Wait all existing dio workers, newcomers will block on i_rwsem */ - inode_dio_wait(inode); - - ret = file_modified(file); - if (ret) - goto out_mutex; - /* Preallocate the range including the unaligned edges */ - if (partial_begin || partial_end) { - ret = ext4_alloc_file_blocks(file, - round_down(offset, 1 << blkbits) >> blkbits, - (round_up((offset + len), 1 << blkbits) - - round_down(offset, 1 << blkbits)) >> blkbits, - new_size, flags); - if (ret) - goto out_mutex; + if (!IS_ALIGNED(offset | end, blocksize)) { + ext4_lblk_t alloc_lblk = offset >> blkbits; + ext4_lblk_t len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits); + ret = ext4_alloc_file_blocks(file, alloc_lblk, len_lblk, + new_size, flags); + if (ret) + return ret; } - /* Zero range excluding the unaligned edges */ - if (max_blocks > 0) { - flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | - EXT4_EX_NOCACHE); - - /* - * Prevent page faults from reinstantiating pages we have - * released from page cache. - */ - filemap_invalidate_lock(mapping); - - ret = ext4_break_layouts(inode); - if (ret) { - filemap_invalidate_unlock(mapping); - goto out_mutex; - } - - ret = ext4_update_disksize_before_punch(inode, offset, len); - if (ret) { - filemap_invalidate_unlock(mapping); - goto out_mutex; - } - - /* - * For journalled data we need to write (and checkpoint) pages - * before discarding page cache to avoid inconsitent data on - * disk in case of crash before zeroing trans is committed. 
- */ - if (ext4_should_journal_data(inode)) { - ret = filemap_write_and_wait_range(mapping, start, - end - 1); - if (ret) { - filemap_invalidate_unlock(mapping); - goto out_mutex; - } - } + ret = ext4_update_disksize_before_punch(inode, offset, len); + if (ret) + return ret; - /* Now release the pages and zero block aligned part of pages */ - truncate_pagecache_range(inode, start, end - 1); - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + /* Now release the pages and zero block aligned part of pages */ + ret = ext4_truncate_page_cache_block_range(inode, offset, end); + if (ret) + return ret; - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, - flags); - filemap_invalidate_unlock(mapping); + /* Zero range excluding the unaligned edges */ + start_lblk = EXT4_B_TO_LBLK(inode, offset); + end_lblk = end >> blkbits; + if (end_lblk > start_lblk) { + ext4_lblk_t zero_blks = end_lblk - start_lblk; + + flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE); + ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks, + new_size, flags); if (ret) - goto out_mutex; + return ret; } - if (!partial_begin && !partial_end) - goto out_mutex; + /* Finish zeroing out if the range doesn't contain a partial block */ + if (IS_ALIGNED(offset | end, blocksize)) + return ret; /* * In worst case we have to writeout two nonadjacent unwritten @@ -4705,27 +4639,69 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(inode->i_sb, ret); - goto out_mutex; + return ret; } - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + /* Zero out partial block at the edges of the range */ + ret = ext4_zero_partial_blocks(handle, inode, offset, len); + if (ret) + goto out_handle; + if (new_size) ext4_update_inode_size(inode, new_size); ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) goto out_handle; - /* Zero out partial block at the edges of the range */ - ret = ext4_zero_partial_blocks(handle, inode, offset, len); - if (ret >= 0) - ext4_update_inode_fsync_trans(handle, inode, 1); + ext4_update_inode_fsync_trans(handle, inode, 1); if (file->f_flags & O_SYNC) ext4_handle_sync(handle); out_handle: ext4_journal_stop(handle); -out_mutex: - inode_unlock(inode); + return ret; +} + +static long ext4_do_fallocate(struct file *file, loff_t offset, + loff_t len, int mode) +{ + struct inode *inode = file_inode(file); + loff_t end = offset + len; + loff_t new_size = 0; + ext4_lblk_t start_lblk, len_lblk; + int ret; + + trace_ext4_fallocate_enter(inode, offset, len, mode); + WARN_ON_ONCE(!inode_is_locked(inode)); + + start_lblk = offset >> inode->i_blkbits; + len_lblk = EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits); + + /* We only support preallocation for extent-based files.
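 *
 * For reference, EXT4_MAX_BLOCKS(len, offset, blkbits) above is expected
 * to count every block touched by the byte range, roughly
 * DIV_ROUND_UP(offset + len, blocksize) - (offset >> blkbits); with the
 * illustrative 4096-byte blocks used earlier:
 *
 *	offset = 5000, len = 20000		// bytes 5000..24999
 *	start_lblk = 5000 >> 12 = 1		// first touched block
 *	len_lblk = 7 - 1 = 6			// blocks 1..6 inclusive
 *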
*/ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + ret = -EOPNOTSUPP; + goto out; + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) { + new_size = end; + ret = inode_newsize_ok(inode, new_size); + if (ret) + goto out; + } + + ret = ext4_alloc_file_blocks(file, start_lblk, len_lblk, new_size, + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); + if (ret) + goto out; + + if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { + ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, + EXT4_I(inode)->i_sync_tid); + } +out: + trace_ext4_fallocate_exit(inode, offset, len_lblk, ret); return ret; } @@ -4739,12 +4715,8 @@ out_mutex: long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); - loff_t new_size = 0; - unsigned int max_blocks; - int ret = 0; - int flags; - ext4_lblk_t lblk; - unsigned int blkbits = inode->i_blkbits; + struct address_space *mapping = file->f_mapping; + int ret; /* * Encrypted inodes can't handle collapse range or insert @@ -4764,73 +4736,47 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) inode_lock(inode); ret = ext4_convert_inline_data(inode); - inode_unlock(inode); if (ret) - goto exit; - - if (mode & FALLOC_FL_PUNCH_HOLE) { - ret = ext4_punch_hole(file, offset, len); - goto exit; - } + goto out_inode_lock; - if (mode & FALLOC_FL_COLLAPSE_RANGE) { - ret = ext4_collapse_range(file, offset, len); - goto exit; - } + /* Wait all existing dio workers, newcomers will block on i_rwsem */ + inode_dio_wait(inode); - if (mode & FALLOC_FL_INSERT_RANGE) { - ret = ext4_insert_range(file, offset, len); - goto exit; - } + ret = file_modified(file); + if (ret) + goto out_inode_lock; - if (mode & FALLOC_FL_ZERO_RANGE) { - ret = ext4_zero_range(file, offset, len, mode); - goto exit; + if ((mode & FALLOC_FL_MODE_MASK) == FALLOC_FL_ALLOCATE_RANGE) { + ret = ext4_do_fallocate(file, offset, len, mode); + goto out_inode_lock; } - trace_ext4_fallocate_enter(inode, offset, len, mode); - lblk = offset >> blkbits; - - max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); - flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; - - inode_lock(inode); /* - * We only support preallocation for extent-based files only + * Follow-up operations will drop page cache, hold invalidate lock + * to prevent page faults from reinstantiating pages we have + * released from page cache. 
*/ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { - ret = -EOPNOTSUPP; - goto out; - } - - if (!(mode & FALLOC_FL_KEEP_SIZE) && - (offset + len > inode->i_size || - offset + len > EXT4_I(inode)->i_disksize)) { - new_size = offset + len; - ret = inode_newsize_ok(inode, new_size); - if (ret) - goto out; - } - - /* Wait all existing dio workers, newcomers will block on i_rwsem */ - inode_dio_wait(inode); + filemap_invalidate_lock(mapping); - ret = file_modified(file); + ret = ext4_break_layouts(inode); if (ret) - goto out; + goto out_invalidate_lock; - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); - if (ret) - goto out; + if (mode & FALLOC_FL_PUNCH_HOLE) + ret = ext4_punch_hole(file, offset, len); + else if (mode & FALLOC_FL_COLLAPSE_RANGE) + ret = ext4_collapse_range(file, offset, len); + else if (mode & FALLOC_FL_INSERT_RANGE) + ret = ext4_insert_range(file, offset, len); + else if (mode & FALLOC_FL_ZERO_RANGE) + ret = ext4_zero_range(file, offset, len, mode); + else + ret = -EOPNOTSUPP; - if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { - ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, - EXT4_I(inode)->i_sync_tid); - } -out: +out_invalidate_lock: + filemap_invalidate_unlock(mapping); +out_inode_lock: inode_unlock(inode); - trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); -exit: return ret; } @@ -5332,109 +5278,72 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; - ext4_lblk_t punch_start, punch_stop; + loff_t end = offset + len; + ext4_lblk_t start_lblk, end_lblk; handle_t *handle; unsigned int credits; - loff_t new_size, ioffset; + loff_t start, new_size; int ret; - /* - * We need to test this early because xfstests assumes that a - * collapse range of (0, 1) will return EOPNOTSUPP if the file - * system does not support collapse range. - */ + trace_ext4_collapse_range(inode, offset, len); + WARN_ON_ONCE(!inode_is_locked(inode)); + + /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; - /* Collapse range works only on fs cluster size aligned regions. */ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; - - trace_ext4_collapse_range(inode, offset, len); - - punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); - punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); - - inode_lock(inode); /* * There is no need to overlap collapse range with EOF, in which case * it is effectively a truncate operation */ - if (offset + len >= inode->i_size) { - ret = -EINVAL; - goto out_mutex; - } - - /* Currently just for extent based files */ - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - ret = -EOPNOTSUPP; - goto out_mutex; - } - - /* Wait for existing dio to complete */ - inode_dio_wait(inode); - - ret = file_modified(file); - if (ret) - goto out_mutex; - - /* - * Prevent page faults from reinstantiating pages we have released from - * page cache. - */ - filemap_invalidate_lock(mapping); - - ret = ext4_break_layouts(inode); - if (ret) - goto out_mmap; + if (end >= inode->i_size) + return -EINVAL; /* + * Write tail of the last page before removed range and data that + * will be shifted since they will get removed from the page cache + * below. We are also protected from pages becoming dirty by + * i_rwsem and invalidate_lock. 
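 *
 * Illustrative numbers for these two flushes (not part of the patch),
 * assuming blocksize = 1024, PAGE_SIZE = 4096 and a collapse of
 * len = 2048 bytes at offset = 5120 (cluster-aligned, not page-aligned):
 *
 *	start = round_down(5120, PAGE_SIZE);			// 4096
 *	filemap_write_and_wait_range(mapping, 4096, 5120);	// tail of last kept page
 *	filemap_write_and_wait_range(mapping, 7168, LLONG_MAX);	// from end, data to be shifted
 *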
* Need to round down offset to be aligned with page size boundary * for page size > block size. */ - ioffset = round_down(offset, PAGE_SIZE); - /* - * Write tail of the last page before removed range since it will get - * removed from the page cache below. - */ - ret = filemap_write_and_wait_range(mapping, ioffset, offset); - if (ret) - goto out_mmap; - /* - * Write data that will be shifted to preserve them when discarding - * page cache below. We are also protected from pages becoming dirty - * by i_rwsem and invalidate_lock. - */ - ret = filemap_write_and_wait_range(mapping, offset + len, - LLONG_MAX); + start = round_down(offset, PAGE_SIZE); + ret = filemap_write_and_wait_range(mapping, start, offset); + if (!ret) + ret = filemap_write_and_wait_range(mapping, end, LLONG_MAX); if (ret) - goto out_mmap; - truncate_pagecache(inode, ioffset); + return ret; + + truncate_pagecache(inode, start); credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_mmap; - } + if (IS_ERR(handle)) + return PTR_ERR(handle); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); + start_lblk = offset >> inode->i_blkbits; + end_lblk = (offset + len) >> inode->i_blkbits; + down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); - ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start); + ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk); - ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); + ret = ext4_ext_remove_space(inode, start_lblk, end_lblk - 1); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); - goto out_stop; + goto out_handle; } ext4_discard_preallocations(inode); - ret = ext4_ext_shift_extents(inode, handle, punch_stop, - punch_stop - punch_start, SHIFT_LEFT); + ret = ext4_ext_shift_extents(inode, handle, end_lblk, + end_lblk - start_lblk, SHIFT_LEFT); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); - goto out_stop; + goto out_handle; } new_size = inode->i_size - len; @@ -5442,18 +5351,16 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) EXT4_I(inode)->i_disksize = new_size; up_write(&EXT4_I(inode)->i_data_sem); - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ext4_mark_inode_dirty(handle, inode); + if (ret) + goto out_handle; + ext4_update_inode_fsync_trans(handle, inode, 1); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); -out_stop: +out_handle: ext4_journal_stop(handle); -out_mmap: - filemap_invalidate_unlock(mapping); -out_mutex: - inode_unlock(inode); return ret; } @@ -5473,100 +5380,63 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) handle_t *handle; struct ext4_ext_path *path; struct ext4_extent *extent; - ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0; + ext4_lblk_t start_lblk, len_lblk, ee_start_lblk = 0; unsigned int credits, ee_len; - int ret = 0, depth, split_flag = 0; - loff_t ioffset; + int ret, depth, split_flag = 0; + loff_t start; - /* - * We need to test this early because xfstests assumes that an - * insert range of (0, 1) will return EOPNOTSUPP if the file - * system does not support insert range. 
- */ + trace_ext4_insert_range(inode, offset, len); + WARN_ON_ONCE(!inode_is_locked(inode)); + + /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; - /* Insert range works only on fs cluster size aligned regions. */ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; - - trace_ext4_insert_range(inode, offset, len); - - offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb); - len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb); - - inode_lock(inode); - /* Currently just for extent based files */ - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - ret = -EOPNOTSUPP; - goto out_mutex; - } - - /* Check whether the maximum file size would be exceeded */ - if (len > inode->i_sb->s_maxbytes - inode->i_size) { - ret = -EFBIG; - goto out_mutex; - } - /* Offset must be less than i_size */ - if (offset >= inode->i_size) { - ret = -EINVAL; - goto out_mutex; - } - - /* Wait for existing dio to complete */ - inode_dio_wait(inode); - - ret = file_modified(file); - if (ret) - goto out_mutex; + if (offset >= inode->i_size) + return -EINVAL; + /* Check whether the maximum file size would be exceeded */ + if (len > inode->i_sb->s_maxbytes - inode->i_size) + return -EFBIG; /* - * Prevent page faults from reinstantiating pages we have released from - * page cache. + * Write out all dirty pages. Need to round down to align start offset + * to page size boundary for page size > block size. */ - filemap_invalidate_lock(mapping); - - ret = ext4_break_layouts(inode); + start = round_down(offset, PAGE_SIZE); + ret = filemap_write_and_wait_range(mapping, start, LLONG_MAX); if (ret) - goto out_mmap; + return ret; - /* - * Need to round down to align start offset to page size boundary - * for page size > block size. 
- */ - ioffset = round_down(offset, PAGE_SIZE); - /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, - LLONG_MAX); - if (ret) - goto out_mmap; - truncate_pagecache(inode, ioffset); + truncate_pagecache(inode, start); credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_mmap; - } + if (IS_ERR(handle)) + return PTR_ERR(handle); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; EXT4_I(inode)->i_disksize += len; - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ext4_mark_inode_dirty(handle, inode); if (ret) - goto out_stop; + goto out_handle; + + start_lblk = offset >> inode->i_blkbits; + len_lblk = len >> inode->i_blkbits; down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); - path = ext4_find_extent(inode, offset_lblk, NULL, 0); + path = ext4_find_extent(inode, start_lblk, NULL, 0); if (IS_ERR(path)) { up_write(&EXT4_I(inode)->i_data_sem); ret = PTR_ERR(path); - goto out_stop; + goto out_handle; } depth = ext_depth(inode); @@ -5576,16 +5446,16 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) ee_len = ext4_ext_get_actual_len(extent); /* - * If offset_lblk is not the starting block of extent, split - * the extent @offset_lblk + * If start_lblk is not the starting block of extent, split + * the extent @start_lblk */ - if ((offset_lblk > ee_start_lblk) && - (offset_lblk < (ee_start_lblk + ee_len))) { + if ((start_lblk > ee_start_lblk) && + (start_lblk < (ee_start_lblk + ee_len))) { if (ext4_ext_is_unwritten(extent)) split_flag = EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2; path = ext4_split_extent_at(handle, inode, path, - offset_lblk, split_flag, + start_lblk, split_flag, EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_METADATA_NOFAIL); @@ -5594,32 +5464,29 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) if (IS_ERR(path)) { up_write(&EXT4_I(inode)->i_data_sem); ret = PTR_ERR(path); - goto out_stop; + goto out_handle; } } ext4_free_ext_path(path); - ext4_es_remove_extent(inode, offset_lblk, EXT_MAX_BLOCKS - offset_lblk); + ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk); /* - * if offset_lblk lies in a hole which is at start of file, use + * if start_lblk lies in a hole which is at start of file, use * ee_start_lblk to shift extents */ ret = ext4_ext_shift_extents(inode, handle, - max(ee_start_lblk, offset_lblk), len_lblk, SHIFT_RIGHT); - + max(ee_start_lblk, start_lblk), len_lblk, SHIFT_RIGHT); up_write(&EXT4_I(inode)->i_data_sem); + if (ret) + goto out_handle; + + ext4_update_inode_fsync_trans(handle, inode, 1); if (IS_SYNC(inode)) ext4_handle_sync(handle); - if (ret >= 0) - ext4_update_inode_fsync_trans(handle, inode, 1); -out_stop: +out_handle: ext4_journal_stop(handle); -out_mmap: - filemap_invalidate_unlock(mapping); -out_mutex: - inode_unlock(inode); return ret; } diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index ae29832aab1e..d1401d4a5513 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1551,7 +1551,6 @@ retry: ext4_es_print_tree(inode); ext4_da_release_space(inode, reserved); - return; } static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 3bd96c3d4cd0..beb078ee4811 100644 --- 
a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -688,10 +688,12 @@ out: static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { + int ret; struct inode *inode = file_inode(iocb->ki_filp); - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; #ifdef CONFIG_FS_DAX if (IS_DAX(inode)) @@ -700,7 +702,6 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_ATOMIC) { size_t len = iov_iter_count(from); - int ret; if (len < EXT4_SB(inode->i_sb)->s_awu_min || len > EXT4_SB(inode->i_sb)->s_awu_max) @@ -800,11 +801,16 @@ static const struct vm_operations_struct ext4_file_vm_ops = { static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) { + int ret; struct inode *inode = file->f_mapping->host; struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + if (file->f_mode & FMODE_WRITE) + ret = ext4_emergency_state(inode->i_sb); + else + ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0; + if (unlikely(ret)) + return ret; /* * We don't support synchronous mappings for non-DAX files and @@ -835,7 +841,8 @@ static int ext4_sample_last_mounted(struct super_block *sb, if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED))) return 0; - if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb)) + if (ext4_emergency_state(sb) || sb_rdonly(sb) || + !sb_start_intwrite_trylock(sb)) return 0; ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED); @@ -878,8 +885,12 @@ static int ext4_file_open(struct inode *inode, struct file *filp) { int ret; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + if (filp->f_mode & FMODE_WRITE) + ret = ext4_emergency_state(inode->i_sb); + else + ret = ext4_forced_shutdown(inode->i_sb) ? 
-EIO : 0; + if (unlikely(ret)) + return ret; ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt); if (ret) diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index b40d3b29f7e5..e476c6de3074 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -132,20 +132,16 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) bool needs_barrier = false; struct inode *inode = file->f_mapping->host; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; ASSERT(ext4_journal_current_handle() == NULL); trace_ext4_sync_file_enter(file, datasync); - if (sb_rdonly(inode->i_sb)) { - /* Make sure that we read updated s_ext4_flags value */ - smp_rmb(); - if (ext4_forced_shutdown(inode->i_sb)) - ret = -EROFS; + if (sb_rdonly(inode->i_sb)) goto out; - } if (!EXT4_SB(inode->i_sb)->s_journal) { ret = ext4_fsync_nojournal(file, start, end, datasync, diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index deabe29da7fb..33cd5b6b02d5 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -302,7 +302,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len, if (len && IS_CASEFOLDED(dir) && (!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir))) { - buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL); + buff = kzalloc(PATH_MAX, GFP_KERNEL); if (!buff) return -ENOMEM; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 21d228073d79..38bc8d74f4cc 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -951,8 +951,9 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap, sb = dir->i_sb; sbi = EXT4_SB(sb); - if (unlikely(ext4_forced_shutdown(sb))) - return ERR_PTR(-EIO); + ret2 = ext4_emergency_state(sb); + if (unlikely(ret2)) + return ERR_PTR(ret2); ngroups = ext4_get_groups_count(sb); trace_ext4_request_inode(dir, mode); @@ -1282,7 +1283,7 @@ got: inode->i_generation = get_random_u32(); /* Precompute checksum seed for inode metadata */ - if (ext4_has_metadata_csum(sb)) { + if (ext4_has_feature_metadata_csum(sb)) { __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = cpu_to_le32(inode->i_generation); @@ -1298,7 +1299,7 @@ got: ei->i_extra_isize = sbi->s_want_extra_isize; ei->i_inline_off = 0; if (ext4_has_feature_inline_data(sb) && - (!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode))) + (!(ei->i_flags & (EXT4_DAX_FL|EXT4_EA_INODE_FL)) || S_ISDIR(mode))) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ret = inode; err = dquot_alloc_inode(inode); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 3536ca7e4fcc..f608f6554b95 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -20,6 +20,11 @@ #define EXT4_INLINE_DOTDOT_OFFSET 2 #define EXT4_INLINE_DOTDOT_SIZE 4 + +static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, + struct inode *inode, + void **fsdata); + static int ext4_get_inline_size(struct inode *inode) { if (EXT4_I(inode)->i_inline_off) @@ -228,7 +233,7 @@ static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, struct ext4_inode *raw_inode; int cp_len = 0; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) + if (unlikely(ext4_emergency_state(inode->i_sb))) return; BUG_ON(!EXT4_I(inode)->i_inline_off); @@ -653,91 +658,109 @@ out_nofolio: } /* - * Try to write data in the inode. - * If the inode has inline data, check whether the new write can be - * in the inode also. If not, create the page the handle, move the data - * to the page make it update and let the later codes create extent for it. 
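 *
 * A sketch (not part of the patch) of how the unified helper introduced
 * below ends up being called, based on the two callers visible in this
 * diff; the plain write path passes da = false, the delalloc path passes
 * its fsdata cookie and da = true:
 *
 *	// from ext4_try_to_write_inline_data()
 *	ret = ext4_generic_write_inline_data(mapping, inode, pos, len,
 *					     foliop, NULL, false);
 *	// from ext4_da_write_begin()
 *	ret = ext4_generic_write_inline_data(mapping, inode, pos, len,
 *					     foliop, fsdata, true);
 *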
+ * Prepare the write for the inline data. + * If the data can be written into the inode, we just read + * the page and make it uptodate, and start the journal. + * Otherwise read the page, mark it dirty so that it can be + * handled in writepages() (the i_disksize update is left to the + * normal ext4_da_write_end). */ -int ext4_try_to_write_inline_data(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - struct folio **foliop) +int ext4_generic_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + struct folio **foliop, + void **fsdata, bool da) { int ret; handle_t *handle; struct folio *folio; struct ext4_iloc iloc; - - if (pos + len > ext4_get_max_inline_size(inode)) - goto convert; + int retries = 0; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; - /* - * The possible write could happen in the inode, - * so try to reserve the space in inode first. - */ +retry_journal: handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - handle = NULL; - goto out; + goto out_release_bh; } ret = ext4_prepare_inline_data(handle, inode, pos + len); if (ret && ret != -ENOSPC) - goto out; + goto out_stop_journal; - /* We don't have space in inline inode, so convert it to extent. */ if (ret == -ENOSPC) { ext4_journal_stop(handle); - brelse(iloc.bh); - goto convert; - } + if (!da) { + brelse(iloc.bh); + /* Retry inside */ + return ext4_convert_inline_data_to_extent(mapping, inode); + } - ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, - EXT4_JTR_NONE); - if (ret) - goto out; + ret = ext4_da_convert_inline_data_to_extent(mapping, inode, fsdata); + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry_journal; + goto out_release_bh; + } folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) { ret = PTR_ERR(folio); - goto out; + goto out_stop_journal; } - *foliop = folio; down_read(&EXT4_I(inode)->xattr_sem); + /* Someone else has already converted it to an extent */ if (!ext4_has_inline_data(inode)) { ret = 0; - folio_unlock(folio); - folio_put(folio); - goto out_up_read; + goto out_release_folio; } if (!folio_test_uptodate(folio)) { ret = ext4_read_inline_folio(inode, folio); - if (ret < 0) { - folio_unlock(folio); - folio_put(folio); - goto out_up_read; - } + if (ret < 0) + goto out_release_folio; } - ret = 1; - handle = NULL; -out_up_read: + ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, EXT4_JTR_NONE); + if (ret) + goto out_release_folio; + *foliop = folio; up_read(&EXT4_I(inode)->xattr_sem); -out: - if (handle && (ret != 1)) - ext4_journal_stop(handle); + brelse(iloc.bh); + return 1; + +out_release_folio: + up_read(&EXT4_I(inode)->xattr_sem); + folio_unlock(folio); + folio_put(folio); +out_stop_journal: + ext4_journal_stop(handle); +out_release_bh: brelse(iloc.bh); return ret; -convert: - return ext4_convert_inline_data_to_extent(mapping, inode); +} + +/* + * Try to write data in the inode. + * If the inode has inline data, check whether the new write can be + * in the inode also. If not, create the page and the handle, move the data + * to the page, mark it uptodate and let the later code create an extent for it.
+ */ +int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + struct folio **foliop) +{ + if (pos + len > ext4_get_max_inline_size(inode)) + return ext4_convert_inline_data_to_extent(mapping, inode); + return ext4_generic_write_inline_data(mapping, inode, pos, len, + foliop, NULL, false); } int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, @@ -881,94 +904,6 @@ out: return ret; } -/* - * Prepare the write for the inline data. - * If the data can be written into the inode, we just read - * the page and make it uptodate, and start the journal. - * Otherwise read the page, makes it dirty so that it can be - * handle in writepages(the i_disksize update is left to the - * normal ext4_da_write_end). - */ -int ext4_da_write_inline_data_begin(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - struct folio **foliop, - void **fsdata) -{ - int ret; - handle_t *handle; - struct folio *folio; - struct ext4_iloc iloc; - int retries = 0; - - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) - return ret; - -retry_journal: - handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - - ret = ext4_prepare_inline_data(handle, inode, pos + len); - if (ret && ret != -ENOSPC) - goto out_journal; - - if (ret == -ENOSPC) { - ext4_journal_stop(handle); - ret = ext4_da_convert_inline_data_to_extent(mapping, - inode, - fsdata); - if (ret == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry_journal; - goto out; - } - - /* - * We cannot recurse into the filesystem as the transaction - * is already started. - */ - folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, - mapping_gfp_mask(mapping)); - if (IS_ERR(folio)) { - ret = PTR_ERR(folio); - goto out_journal; - } - - down_read(&EXT4_I(inode)->xattr_sem); - if (!ext4_has_inline_data(inode)) { - ret = 0; - goto out_release_page; - } - - if (!folio_test_uptodate(folio)) { - ret = ext4_read_inline_folio(inode, folio); - if (ret < 0) - goto out_release_page; - } - ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, - EXT4_JTR_NONE); - if (ret) - goto out_release_page; - - up_read(&EXT4_I(inode)->xattr_sem); - *foliop = folio; - brelse(iloc.bh); - return 1; -out_release_page: - up_read(&EXT4_I(inode)->xattr_sem); - folio_unlock(folio); - folio_put(folio); -out_journal: - ext4_journal_stop(handle); -out: - brelse(iloc.bh); - return ret; -} - #ifdef INLINE_DIR_DEBUG void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, void *inline_start, int inline_size) @@ -1012,7 +947,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, int err; struct ext4_dir_entry_2 *de; - err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start, + err = ext4_find_dest_de(dir, iloc->bh, inline_start, inline_size, fname, &de); if (err) return err; @@ -1146,7 +1081,7 @@ static int ext4_finish_convert_inline_dir(handle_t *handle, memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, inline_size - EXT4_INLINE_DOTDOT_SIZE); - if (ext4_has_metadata_csum(inode->i_sb)) + if (ext4_has_feature_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); inode->i_size = inode->i_sb->s_blocksize; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d04d8a7f12e7..bcb96caf77c0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -31,6 +31,7 @@ #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/mpage.h> +#include <linux/rmap.h> 
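/*
 * The ext4_forced_shutdown() to ext4_emergency_state() conversions in the
 * hunks below assume a helper with roughly these semantics; this is a
 * reader's sketch, the real definition lives in ext4.h in this series:
 *
 *	static inline int ext4_emergency_state(struct super_block *sb)
 *	{
 *		if (unlikely(ext4_forced_shutdown(sb)))
 *			return -EIO;
 *		if (unlikely(ext4_emergency_ro(sb)))
 *			return -EROFS;
 *		return 0;
 *	}
 */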
#include <linux/namei.h> #include <linux/uio.h> #include <linux/bio.h> @@ -93,7 +94,7 @@ static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw, if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || - !ext4_has_metadata_csum(inode->i_sb)) + !ext4_has_feature_metadata_csum(inode->i_sb)) return 1; provided = le16_to_cpu(raw->i_checksum_lo); @@ -114,7 +115,7 @@ void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || - !ext4_has_metadata_csum(inode->i_sb)) + !ext4_has_feature_metadata_csum(inode->i_sb)) return; csum = ext4_inode_csum(inode, raw, ei); @@ -751,7 +752,7 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) flags &= EXT4_MAP_FLAGS; /* Dummy buffer_head? Set non-atomically. */ - if (!bh->b_page) { + if (!bh->b_folio) { bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags; return; } @@ -1149,8 +1150,9 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; unsigned from, to; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; trace_ext4_write_begin(inode, pos, len); /* @@ -2225,7 +2227,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) mpd->io_submit.io_end->handle = handle->h_rsv_handle; handle->h_rsv_handle = NULL; } - ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end); + ext4_set_io_unwritten_flag(mpd->io_submit.io_end); } BUG_ON(map->m_len == 0); @@ -2273,7 +2275,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, if (err < 0) { struct super_block *sb = inode->i_sb; - if (ext4_forced_shutdown(sb)) + if (ext4_emergency_state(sb)) goto invalidate_dirty_pages; /* * Let the uper layers retry transient errors. @@ -2599,10 +2601,9 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) * *never* be called, so if that ever happens, we would want * the stack trace. 
*/ - if (unlikely(ext4_forced_shutdown(mapping->host->i_sb))) { - ret = -EROFS; + ret = ext4_emergency_state(mapping->host->i_sb); + if (unlikely(ret)) goto out_writepages; - } /* * If we have inline data and arrive here, it means that @@ -2817,8 +2818,9 @@ static int ext4_writepages(struct address_space *mapping, int ret; int alloc_ctx; - if (unlikely(ext4_forced_shutdown(sb))) - return -EIO; + ret = ext4_emergency_state(sb); + if (unlikely(ret)) + return ret; alloc_ctx = ext4_writepages_down_read(sb); ret = ext4_do_writepages(&mpd); @@ -2858,8 +2860,9 @@ static int ext4_dax_writepages(struct address_space *mapping, struct inode *inode = mapping->host; int alloc_ctx; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; alloc_ctx = ext4_writepages_down_read(inode->i_sb); trace_ext4_writepages(inode, wbc); @@ -2915,8 +2918,9 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; struct inode *inode = mapping->host; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; index = pos >> PAGE_SHIFT; @@ -2929,8 +2933,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, trace_ext4_da_write_begin(inode, pos, len); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { - ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len, - foliop, fsdata); + ret = ext4_generic_write_inline_data(mapping, inode, pos, len, + foliop, fsdata, true); if (ret < 0) return ret; if (ret == 1) @@ -3906,6 +3910,68 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, return ret; } +static inline void ext4_truncate_folio(struct inode *inode, + loff_t start, loff_t end) +{ + unsigned long blocksize = i_blocksize(inode); + struct folio *folio; + + /* Nothing to be done if no complete block needs to be truncated. */ + if (round_up(start, blocksize) >= round_down(end, blocksize)) + return; + + folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT); + if (IS_ERR(folio)) + return; + + if (folio_mkclean(folio)) + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); +} + +int ext4_truncate_page_cache_block_range(struct inode *inode, + loff_t start, loff_t end) +{ + unsigned long blocksize = i_blocksize(inode); + int ret; + + /* + * For journalled data we need to write (and checkpoint) pages + * before discarding page cache to avoid inconsistent data on disk + * in case of a crash before the freeing or unwritten-conversion + * transaction is committed. */ + if (ext4_should_journal_data(inode)) { + ret = filemap_write_and_wait_range(inode->i_mapping, start, + end - 1); + if (ret) + return ret; + goto truncate_pagecache; + } + + /* + * If the block size is less than the page size, the file's mapped + * blocks within one page could be freed or converted to unwritten. + * So it's necessary to remove writable userspace mappings, and then + * ext4_page_mkwrite() can be called during subsequent write access + * to these partial folios.
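 *
 * An illustrative case (not part of the patch), assuming blocksize = 1024
 * and PAGE_SIZE = 4096: dropping byte range [1024, 9216) leaves live data
 * in both edge folios, so each one is write-protected first:
 *
 *	page_boundary = round_up(1024, PAGE_SIZE);	// 4096
 *	ext4_truncate_folio(inode, 1024, 4096);		// head folio [0, 4096)
 *	ext4_truncate_folio(inode, 8192, 9216);		// tail folio, round_down(9216, 4096) = 8192
 *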
+ */ + if (!IS_ALIGNED(start | end, PAGE_SIZE) && + blocksize < PAGE_SIZE && start < inode->i_size) { + loff_t page_boundary = round_up(start, PAGE_SIZE); + + ext4_truncate_folio(inode, start, min(page_boundary, end)); + if (end > page_boundary) + ext4_truncate_folio(inode, + round_down(end, PAGE_SIZE), end); + } + +truncate_pagecache: + truncate_pagecache_range(inode, start, end - 1); + return 0; +} + static void ext4_wait_dax_page(struct inode *inode) { filemap_invalidate_unlock(inode->i_mapping); @@ -3950,91 +4016,50 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; - ext4_lblk_t first_block, stop_block; - struct address_space *mapping = inode->i_mapping; - loff_t first_block_offset, last_block_offset, max_length; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t start_lblk, end_lblk; + loff_t max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize; + loff_t end = offset + length; handle_t *handle; unsigned int credits; - int ret = 0, ret2 = 0; + int ret; trace_ext4_punch_hole(inode, offset, length, 0); - - /* - * Write out all dirty pages to avoid race conditions - * Then release them. - */ - if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - ret = filemap_write_and_wait_range(mapping, offset, - offset + length - 1); - if (ret) - return ret; - } - - inode_lock(inode); + WARN_ON_ONCE(!inode_is_locked(inode)); /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) - goto out_mutex; + return 0; /* - * If the hole extends beyond i_size, set the hole - * to end after the page that contains i_size + * If the hole extends beyond i_size, set the hole to end after + * the page that contains i_size, and also clamp the hole so that it + * stays within one block before the last allowed byte range. */ - if (offset + length > inode->i_size) { - length = inode->i_size + - PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) - - offset; - } + if (end > inode->i_size) + end = round_up(inode->i_size, PAGE_SIZE); + if (end > max_end) + end = max_end; + length = end - offset; /* - * For punch hole the length + offset needs to be within one block - * before last range. Adjust the length if it goes beyond that limit. + * Attach jinode to inode for jbd2 if we do any zeroing of partial + * block. */ - max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize; - if (offset + length > max_length) - length = max_length - offset; - - if (offset & (sb->s_blocksize - 1) || - (offset + length) & (sb->s_blocksize - 1)) { - /* - * Attach jinode to inode for jbd2 if we do any zeroing of - * partial block - */ + if (!IS_ALIGNED(offset | end, sb->s_blocksize)) { ret = ext4_inode_attach_jinode(inode); if (ret < 0) - goto out_mutex; - + return ret; } - /* Wait all existing dio workers, newcomers will block on i_rwsem */ - inode_dio_wait(inode); - ret = file_modified(file); + ret = ext4_update_disksize_before_punch(inode, offset, length); if (ret) - goto out_mutex; - - /* - * Prevent page faults from reinstantiating pages we have released from - * page cache.
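 *
 * With illustrative numbers (not part of the patch): PAGE_SIZE = 4096,
 * i_size = 10000, and a punch at offset = 8000 with length = 8000, the
 * clamping above works out as:
 *
 *	end = 8000 + 8000;			// 16000, beyond i_size
 *	end = round_up(10000, PAGE_SIZE);	// 12288, page containing i_size
 *	length = 12288 - 8000;			// 4288, the hole actually punched
 *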
- */ - filemap_invalidate_lock(mapping); - - ret = ext4_break_layouts(inode); - if (ret) - goto out_dio; - - first_block_offset = round_up(offset, sb->s_blocksize); - last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; + return ret; /* Now release the pages and zero block aligned part of pages*/ - if (last_block_offset > first_block_offset) { - ret = ext4_update_disksize_before_punch(inode, offset, length); - if (ret) - goto out_dio; - truncate_pagecache_range(inode, first_block_offset, - last_block_offset); - } + ret = ext4_truncate_page_cache_block_range(inode, offset, end); + if (ret) + return ret; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); @@ -4044,54 +4069,51 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(sb, ret); - goto out_dio; + return ret; } - ret = ext4_zero_partial_blocks(handle, inode, offset, - length); + ret = ext4_zero_partial_blocks(handle, inode, offset, length); if (ret) - goto out_stop; - - first_block = (offset + sb->s_blocksize - 1) >> - EXT4_BLOCK_SIZE_BITS(sb); - stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); + goto out_handle; /* If there are blocks to remove, do it */ - if (stop_block > first_block) { - ext4_lblk_t hole_len = stop_block - first_block; + start_lblk = EXT4_B_TO_LBLK(inode, offset); + end_lblk = end >> inode->i_blkbits; + + if (end_lblk > start_lblk) { + ext4_lblk_t hole_len = end_lblk - start_lblk; down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); - ext4_es_remove_extent(inode, first_block, hole_len); + ext4_es_remove_extent(inode, start_lblk, hole_len); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ret = ext4_ext_remove_space(inode, first_block, - stop_block - 1); + ret = ext4_ext_remove_space(inode, start_lblk, + end_lblk - 1); else - ret = ext4_ind_remove_space(handle, inode, first_block, - stop_block); + ret = ext4_ind_remove_space(handle, inode, start_lblk, + end_lblk); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_handle; + } - ext4_es_insert_extent(inode, first_block, hole_len, ~0, + ext4_es_insert_extent(inode, start_lblk, hole_len, ~0, EXTENT_STATUS_HOLE, 0); up_write(&EXT4_I(inode)->i_data_sem); } - ext4_fc_track_range(handle, inode, first_block, stop_block); + ext4_fc_track_range(handle, inode, start_lblk, end_lblk); + + ret = ext4_mark_inode_dirty(handle, inode); + if (unlikely(ret)) + goto out_handle; + + ext4_update_inode_fsync_trans(handle, inode, 1); if (IS_SYNC(inode)) ext4_handle_sync(handle); - - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); - ret2 = ext4_mark_inode_dirty(handle, inode); - if (unlikely(ret2)) - ret = ret2; - if (ret >= 0) - ext4_update_inode_fsync_trans(handle, inode, 1); -out_stop: +out_handle: ext4_journal_stop(handle); -out_dio: - filemap_invalidate_unlock(mapping); -out_mutex: - inode_unlock(inode); return ret; } @@ -4678,6 +4700,11 @@ static inline int ext4_iget_extra_inode(struct inode *inode, *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { int err; + err = xattr_check_inode(inode, IHDR(inode, raw_inode), + ITAIL(inode, raw_inode)); + if (err) + return err; + ext4_set_inode_state(inode, EXT4_STATE_XATTR); err = ext4_find_inline_data_nolock(inode); if (!err && ext4_has_inline_data(inode)) @@ -4804,7 +4831,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ei->i_extra_isize = 0; /* Precompute checksum seed for inode metadata */ - if 
(ext4_has_metadata_csum(sb)) { + if (ext4_has_feature_metadata_csum(sb)) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); @@ -4891,7 +4918,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, * we'd normally treat htree data as empty space. But with metadata * checksumming that corrupts checksums so forbid that. */ - if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) && + if (!ext4_has_feature_dir_index(sb) && + ext4_has_feature_metadata_csum(sb) && ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { ext4_error_inode(inode, function, line, 0, "iget: Dir with htree data on filesystem without dir_index feature."); @@ -5011,8 +5039,16 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, inode->i_op = &ext4_encrypted_symlink_inode_operations; } else if (ext4_inode_is_fast_symlink(inode)) { inode->i_op = &ext4_fast_symlink_inode_operations; - nd_terminate_link(ei->i_data, inode->i_size, - sizeof(ei->i_data) - 1); + if (inode->i_size == 0 || + inode->i_size >= sizeof(ei->i_data) || + strnlen((char *)ei->i_data, inode->i_size + 1) != + inode->i_size) { + ext4_error_inode(inode, function, line, 0, + "invalid fast symlink length %llu", + (unsigned long long)inode->i_size); + ret = -EFSCORRUPTED; + goto bad_inode; + } inode_set_cached_link(inode, (char *)ei->i_data, inode->i_size); } else { @@ -5232,8 +5268,9 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + err = ext4_emergency_state(inode->i_sb); + if (unlikely(err)) + return err; if (EXT4_SB(inode->i_sb)->s_journal) { if (ext4_journal_current_handle()) { @@ -5355,8 +5392,9 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, const unsigned int ia_valid = attr->ia_valid; bool inc_ivers = true; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + error = ext4_emergency_state(inode->i_sb); + if (unlikely(error)) + return error; if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; @@ -5468,7 +5506,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, oldsize & (inode->i_sb->s_blocksize - 1)) { error = ext4_inode_attach_jinode(inode); if (error) - goto err_out; + goto out_mmap_sem; } handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); @@ -5800,9 +5838,10 @@ int ext4_mark_iloc_dirty(handle_t *handle, { int err = 0; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) { + err = ext4_emergency_state(inode->i_sb); + if (unlikely(err)) { put_bh(iloc->bh); - return -EIO; + return err; } ext4_fc_track_inode(handle, inode); @@ -5826,8 +5865,9 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, { int err; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + err = ext4_emergency_state(inode->i_sb); + if (unlikely(err)) + return err; err = ext4_get_inode_loc(inode, iloc); if (!err) { diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 7b9ce71c1c81..d17207386ead 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -142,7 +142,7 @@ static int ext4_update_backup_sb(struct super_block *sb, es = (struct ext4_super_block *) (bh->b_data + offset); lock_buffer(bh); - if (ext4_has_metadata_csum(sb) && + if (ext4_has_feature_metadata_csum(sb) && es->s_checksum != ext4_superblock_csum(sb, es)) { ext4_msg(sb, KERN_ERR, "Invalid checksum for backup " "superblock %llu", sb_block); @@ -150,7 +150,7 @@ static int ext4_update_backup_sb(struct 
super_block *sb, goto out_bh; } func(es, arg); - if (ext4_has_metadata_csum(sb)) + if (ext4_has_feature_metadata_csum(sb)) es->s_checksum = ext4_superblock_csum(sb, es); set_buffer_uptodate(bh); unlock_buffer(bh); @@ -351,7 +351,7 @@ void ext4_reset_inode_seed(struct inode *inode) __le32 gen = cpu_to_le32(inode->i_generation); __u32 csum; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); @@ -1205,7 +1205,8 @@ static int ext4_ioctl_setuuid(struct file *filp, * If any checksums (group descriptors or metadata) are being used * then the checksum seed feature is required to change the UUID. */ - if (((ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb)) + if (((ext4_has_feature_gdt_csum(sb) || + ext4_has_feature_metadata_csum(sb)) && !ext4_has_feature_csum_seed(sb)) || ext4_has_feature_stable_inodes(sb)) return -EOPNOTSUPP; @@ -1253,7 +1254,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (!inode_owner_or_capable(idmap, inode)) return -EPERM; - if (ext4_has_metadata_csum(inode->i_sb)) { + if (ext4_has_feature_metadata_csum(inode->i_sb)) { ext4_warning(sb, "Setting inode version is not " "supported with metadata_csum enabled."); return -ENOTTY; @@ -1705,7 +1706,7 @@ int ext4_update_overhead(struct super_block *sb, bool force) { struct ext4_sb_info *sbi = EXT4_SB(sb); - if (sb_rdonly(sb)) + if (ext4_emergency_state(sb) || sb_rdonly(sb)) return 0; if (!force && (sbi->s_overhead == 0 || diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c index bb2a223b207c..d634c12f1984 100644 --- a/fs/ext4/mballoc-test.c +++ b/fs/ext4/mballoc-test.c @@ -796,6 +796,7 @@ static void test_mb_mark_used(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy); grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp); ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); KUNIT_ASSERT_EQ(test, ret, 0); @@ -860,6 +861,7 @@ static void test_mb_free_blocks(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy); grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp); ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); KUNIT_ASSERT_EQ(test, ret, 0); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b25a27c86696..0d523e9fb3d5 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -187,7 +187,7 @@ * /sys/fs/ext4/<partition>/mb_min_to_scan * /sys/fs/ext4/<partition>/mb_max_to_scan * /sys/fs/ext4/<partition>/mb_order2_req - * /sys/fs/ext4/<partition>/mb_linear_limit + * /sys/fs/ext4/<partition>/mb_max_linear_groups * * The regular allocator uses buddy scan only if the request len is power of * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The @@ -209,7 +209,7 @@ * get traversed linearly. That may result in subsequent allocations being not * close to each other. And so, the underlying device may get filled up in a * non-linear fashion. While that may not matter on non-rotational devices, for - * rotational devices that may result in higher seek times. "mb_linear_limit" + * rotational devices that may result in higher seek times. "mb_max_linear_groups" * tells mballoc how many groups mballoc should search linearly before * performing consulting above data structures for more efficient lookups. 
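 *
 * A minimal sketch of the policy this knob controls (hypothetical helper
 * names, not the real mballoc internals):
 *
 *	for (i = 0; i < sbi->s_mb_max_linear_groups; i++)
 *		if (group_has_enough_free(start_group + i))
 *			return start_group + i;
 *	// otherwise switch to the order/size indexed structures
 *	return lookup_in_freespace_trees(ac);
 *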
For * non rotational devices, this value defaults to 0 and for rotational devices @@ -5653,7 +5653,7 @@ static inline void ext4_mb_show_pa(struct super_block *sb) { ext4_group_t i, ngroups; - if (ext4_forced_shutdown(sb)) + if (ext4_emergency_state(sb)) return; ngroups = ext4_get_groups_count(sb); @@ -5687,7 +5687,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; - if (ext4_forced_shutdown(sb)) + if (ext4_emergency_state(sb)) return; mb_debug(sb, "Can't allocate:" diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index d64c04ed061a..3e26464b1425 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -21,7 +21,7 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) { - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return 1; return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp); @@ -29,7 +29,7 @@ static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) { - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return; mmp->mmp_checksum = ext4_mmp_csum(sb, mmp); @@ -162,7 +162,7 @@ static int kmmpd(void *data) memcpy(mmp->mmp_nodename, init_utsname()->nodename, sizeof(mmp->mmp_nodename)); - while (!kthread_should_stop() && !ext4_forced_shutdown(sb)) { + while (!kthread_should_stop() && !ext4_emergency_state(sb)) { if (!ext4_has_feature_mmp(sb)) { ext4_warning(sb, "kmmpd being stopped since MMP feature" " has been disabled."); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 716cc6096870..cb5cb33b1d91 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -176,7 +176,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, brelse(bh); return ERR_PTR(-EFSCORRUPTED); } - if (!ext4_has_metadata_csum(inode->i_sb) || + if (!ext4_has_feature_metadata_csum(inode->i_sb) || buffer_verified(bh)) return bh; @@ -291,36 +291,6 @@ struct dx_tail { __le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */ }; -static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); -static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); -static inline unsigned dx_get_hash(struct dx_entry *entry); -static void dx_set_hash(struct dx_entry *entry, unsigned value); -static unsigned dx_get_count(struct dx_entry *entries); -static unsigned dx_get_limit(struct dx_entry *entries); -static void dx_set_count(struct dx_entry *entries, unsigned value); -static void dx_set_limit(struct dx_entry *entries, unsigned value); -static unsigned dx_root_limit(struct inode *dir, unsigned infosize); -static unsigned dx_node_limit(struct inode *dir); -static struct dx_frame *dx_probe(struct ext4_filename *fname, - struct inode *dir, - struct dx_hash_info *hinfo, - struct dx_frame *frame); -static void dx_release(struct dx_frame *frames); -static int dx_make_map(struct inode *dir, struct buffer_head *bh, - struct dx_hash_info *hinfo, - struct dx_map_entry *map_tail); -static void dx_sort_map(struct dx_map_entry *map, unsigned count); -static struct ext4_dir_entry_2 *dx_move_dirents(struct inode *dir, char *from, - char *to, struct dx_map_entry *offsets, - int count, unsigned int blocksize); -static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, - unsigned int blocksize); -static void dx_insert_block(struct dx_frame *frame, - u32 hash, ext4_lblk_t block); -static int ext4_htree_next_block(struct 
inode *dir, __u32 hash, - struct dx_frame *frame, - struct dx_frame *frames, - __u32 *start_hash); static struct buffer_head * ext4_dx_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir); @@ -398,7 +368,7 @@ int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh) { struct ext4_dir_entry_tail *t; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return 1; t = get_dirent_tail(inode, bh); @@ -419,7 +389,7 @@ static void ext4_dirblock_csum_set(struct inode *inode, { struct ext4_dir_entry_tail *t; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; t = get_dirent_tail(inode, bh); @@ -494,7 +464,7 @@ static int ext4_dx_csum_verify(struct inode *inode, struct dx_tail *t; int count_offset, limit, count; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return 1; c = get_dx_countlimit(inode, dirent, &count_offset); @@ -523,7 +493,7 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent) struct dx_tail *t; int count_offset, limit, count; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; c = get_dx_countlimit(inode, dirent, &count_offset); @@ -612,7 +582,7 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) ext4_dir_rec_len(1, NULL) - ext4_dir_rec_len(2, NULL) - infosize; - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } @@ -622,7 +592,7 @@ static inline unsigned dx_node_limit(struct inode *dir) unsigned int entry_space = dir->i_sb->s_blocksize - ext4_dir_rec_len(0, dir); - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } @@ -1076,7 +1046,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, struct ext4_dir_entry_2 *de, *top; int err = 0, count = 0; struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str; - int csum = ext4_has_metadata_csum(dir->i_sb); + int csum = ext4_has_feature_metadata_csum(dir->i_sb); dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", (unsigned long)block)); @@ -1320,7 +1290,7 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh, struct dx_hash_info h = *hinfo; int blocksize = EXT4_BLOCK_SIZE(dir->i_sb); - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) buflen -= sizeof(struct ext4_dir_entry_tail); while ((char *) de < base + buflen) { @@ -1462,7 +1432,8 @@ static bool ext4_match(struct inode *parent, * sure cf_name was properly initialized before * considering the calculated hash. */ - if (IS_ENCRYPTED(parent) && fname->cf_name.name && + if (sb_no_casefold_compat_fallback(parent->i_sb) && + IS_ENCRYPTED(parent) && fname->cf_name.name && (fname->hinfo.hash != EXT4_DIRENT_HASH(de) || fname->hinfo.minor_hash != EXT4_DIRENT_MINOR_HASH(de))) return false; @@ -1595,10 +1566,15 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir, * return. Otherwise, fall back to doing a search the * old fashioned way. 
*/ - if (!IS_ERR(ret) || PTR_ERR(ret) != ERR_BAD_DX_DIR) + if (IS_ERR(ret) && PTR_ERR(ret) == ERR_BAD_DX_DIR) + dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " + "falling back\n")); + else if (!sb_no_casefold_compat_fallback(dir->i_sb) && + *res_dir == NULL && IS_CASEFOLDED(dir)) + dxtrace(printk(KERN_DEBUG "ext4_find_entry: casefold " + "failed, falling back\n")); + else goto cleanup_and_exit; - dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " - "falling back\n")); ret = NULL; } nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); @@ -1945,7 +1921,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, int csum_size = 0; int err = 0, i; - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); bh2 = ext4_append(handle, dir, &newblock); @@ -2060,8 +2036,7 @@ out: return ERR_PTR(err); } -int ext4_find_dest_de(struct inode *dir, struct inode *inode, - struct buffer_head *bh, +int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size, struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de) @@ -2143,11 +2118,11 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, int csum_size = 0; int err, err2; - if (ext4_has_metadata_csum(inode->i_sb)) + if (ext4_has_feature_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); if (!de) { - err = ext4_find_dest_de(dir, inode, bh, bh->b_data, + err = ext4_find_dest_de(dir, bh, bh->b_data, blocksize - csum_size, fname, &de); if (err) return err; @@ -2252,7 +2227,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, struct fake_dirent *fde; int csum_size = 0; - if (ext4_has_metadata_csum(inode->i_sb)) + if (ext4_has_feature_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); blocksize = dir->i_sb->s_blocksize; @@ -2396,7 +2371,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, ext4_lblk_t block, blocks; int csum_size = 0; - if (ext4_has_metadata_csum(inode->i_sb)) + if (ext4_has_feature_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); sb = dir->i_sb; @@ -2427,7 +2402,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if (!retval || (retval != ERR_BAD_DX_DIR)) goto out; /* Can we just ignore htree data? 
*/ - if (ext4_has_metadata_csum(sb)) { + if (ext4_has_feature_metadata_csum(sb)) { EXT4_ERROR_INODE(dir, "Directory has corrupted htree index."); retval = -EFSCORRUPTED; @@ -2577,8 +2552,10 @@ again: BUFFER_TRACE(frame->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, frame->bh, EXT4_JTR_NONE); - if (err) + if (err) { + brelse(bh2); goto journal_error; + } if (!add_level) { unsigned icount1 = icount/2, icount2 = icount - icount1; unsigned hash2 = dx_get_hash(entries + icount1); @@ -2589,8 +2566,10 @@ again: err = ext4_journal_get_write_access(handle, sb, (frame - 1)->bh, EXT4_JTR_NONE); - if (err) + if (err) { + brelse(bh2); goto journal_error; + } memcpy((char *) entries2, (char *) (entries + icount1), icount2 * sizeof(struct dx_entry)); @@ -2609,8 +2588,10 @@ again: dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); err = ext4_handle_dirty_dx_node(handle, dir, bh2); - if (err) + if (err) { + brelse(bh2); goto journal_error; + } brelse (bh2); err = ext4_handle_dirty_dx_node(handle, dir, (frame - 1)->bh); @@ -2635,8 +2616,10 @@ again: "Creating %d level index...\n", dxroot->info.indirect_levels)); err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); - if (err) + if (err) { + brelse(bh2); goto journal_error; + } err = ext4_handle_dirty_dx_node(handle, dir, bh2); brelse(bh2); restart = 1; @@ -2733,7 +2716,7 @@ static int ext4_delete_entry(handle_t *handle, return err; } - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); BUFFER_TRACE(bh, "get_write_access"); @@ -2973,7 +2956,7 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir, int csum_size = 0; int err; - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { @@ -3151,8 +3134,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) struct ext4_dir_entry_2 *de; handle_t *handle = NULL; - if (unlikely(ext4_forced_shutdown(dir->i_sb))) - return -EIO; + retval = ext4_emergency_state(dir->i_sb); + if (unlikely(retval)) + return retval; /* Initialize quotas before so that eventual writes go in * separate transaction */ @@ -3309,8 +3293,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) { int retval; - if (unlikely(ext4_forced_shutdown(dir->i_sb))) - return -EIO; + retval = ext4_emergency_state(dir->i_sb); + if (unlikely(retval)) + return retval; trace_ext4_unlink_enter(dir, dentry); /* @@ -3376,8 +3361,9 @@ static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir, struct fscrypt_str disk_link; int retries = 0; - if (unlikely(ext4_forced_shutdown(dir->i_sb))) - return -EIO; + err = ext4_emergency_state(dir->i_sb); + if (unlikely(err)) + return err; err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize, &disk_link); @@ -4199,8 +4185,9 @@ static int ext4_rename2(struct mnt_idmap *idmap, { int err; - if (unlikely(ext4_forced_shutdown(old_dir->i_sb))) - return -EIO; + err = ext4_emergency_state(old_dir->i_sb); + if (unlikely(err)) + return err; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index e5b47dda3317..c66e0cb29bd4 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -537,7 +537,7 @@ static int ext4_orphan_file_block_csum_verify(struct super_block *sb, struct ext4_orphan_block_tail *ot; __le64 dsk_block_nr = 
cpu_to_le64(bh->b_blocknr); - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return 1; ot = ext4_orphan_block_tail(sb, bh); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 37abee5016c3..179e54f3a3b6 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -164,7 +164,8 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) } /* - * Check a range of space and convert unwritten extents to written. Note that + * On successful IO, check a range of space and convert unwritten extents to + * written. On IO failure, check if journal abort is needed. Note that * we are protected from truncate touching same part of extent tree by the * fact that truncate code waits for all DIO to finish (thus exclusion from * direct IO is achieved) and also waits for PageWriteback bits. Thus we @@ -175,20 +176,36 @@ static int ext4_end_io_end(ext4_io_end_t *io_end) { struct inode *inode = io_end->inode; handle_t *handle = io_end->handle; + struct super_block *sb = inode->i_sb; int ret = 0; ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p," "list->prev 0x%p\n", io_end, inode->i_ino, io_end->list.next, io_end->list.prev); - io_end->handle = NULL; /* Following call will use up the handle */ - ret = ext4_convert_unwritten_io_end_vec(handle, io_end); - if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) { - ext4_msg(inode->i_sb, KERN_EMERG, + /* + * Do not convert the unwritten extents if data writeback fails, + * or stale data may be exposed. + */ + io_end->handle = NULL; /* Following call will use up the handle */ + if (unlikely(io_end->flag & EXT4_IO_END_FAILED)) { + ret = -EIO; + if (handle) + jbd2_journal_free_reserved(handle); + + if (test_opt(sb, DATA_ERR_ABORT)) + jbd2_journal_abort(EXT4_SB(sb)->s_journal, ret); + } else { + ret = ext4_convert_unwritten_io_end_vec(handle, io_end); + } + if (ret < 0 && !ext4_emergency_state(sb) && + io_end->flag & EXT4_IO_END_UNWRITTEN) { + ext4_msg(sb, KERN_EMERG, "failed to convert unwritten extents to written " "extents -- potential data loss! " "(inode %lu, error %d)", inode->i_ino, ret); } + ext4_clear_io_unwritten_flag(io_end); ext4_release_io_end(io_end); return ret; @@ -217,6 +234,16 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head) #endif } +static bool ext4_io_end_defer_completion(ext4_io_end_t *io_end) +{ + if (io_end->flag & EXT4_IO_END_UNWRITTEN) + return true; + if (test_opt(io_end->inode->i_sb, DATA_ERR_ABORT) && + io_end->flag & EXT4_IO_END_FAILED) + return true; + return false; +} + /* Add the io_end to per-inode completed end_io list. */ static void ext4_add_complete_io(ext4_io_end_t *io_end) { @@ -225,9 +252,11 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end) struct workqueue_struct *wq; unsigned long flags; - /* Only reserved conversions from writeback should enter here */ - WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); - WARN_ON(!io_end->handle && sbi->s_journal); + /* Only reserved conversions or pending IO errors will enter here. 
*/ + WARN_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION)); + WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN && + !io_end->handle && sbi->s_journal); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); wq = sbi->rsv_conversion_wq; if (list_empty(&ei->i_rsv_conversion_list)) @@ -252,7 +281,7 @@ static int ext4_do_flush_completed_IO(struct inode *inode, while (!list_empty(&unwritten)) { io_end = list_entry(unwritten.next, ext4_io_end_t, list); - BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); + BUG_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION)); list_del_init(&io_end->list); err = ext4_end_io_end(io_end); @@ -263,7 +292,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode, } /* - * work on completed IO, to convert unwritten extents to extents + * Used to convert unwritten extents to written extents upon IO completion, + * or used to abort the journal upon IO errors. */ void ext4_end_io_rsv_work(struct work_struct *work) { @@ -288,29 +318,25 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) void ext4_put_io_end_defer(ext4_io_end_t *io_end) { if (refcount_dec_and_test(&io_end->count)) { - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || - list_empty(&io_end->list_vec)) { - ext4_release_io_end(io_end); + if (io_end->flag & EXT4_IO_END_FAILED || + (io_end->flag & EXT4_IO_END_UNWRITTEN && + !list_empty(&io_end->list_vec))) { + ext4_add_complete_io(io_end); return; } - ext4_add_complete_io(io_end); + ext4_release_io_end(io_end); } } int ext4_put_io_end(ext4_io_end_t *io_end) { - int err = 0; - if (refcount_dec_and_test(&io_end->count)) { - if (io_end->flag & EXT4_IO_END_UNWRITTEN) { - err = ext4_convert_unwritten_io_end_vec(io_end->handle, - io_end); - io_end->handle = NULL; - ext4_clear_io_unwritten_flag(io_end); - } + if (ext4_io_end_defer_completion(io_end)) + return ext4_end_io_end(io_end); + ext4_release_io_end(io_end); } - return err; + return 0; } ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) @@ -344,11 +370,12 @@ static void ext4_end_bio(struct bio *bio) bio->bi_status, inode->i_ino, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); + io_end->flag |= EXT4_IO_END_FAILED; mapping_set_error(inode->i_mapping, blk_status_to_errno(bio->bi_status)); } - if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + if (ext4_io_end_defer_completion(io_end)) { /* * Link bio into list hanging from io_end. 
We have to do it * atomically as bio completions can be racing against each diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 72f77f78ae8d..b7ff0d955f0d 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1118,7 +1118,7 @@ static inline void ext4_set_block_group_nr(struct super_block *sb, char *data, struct ext4_super_block *es = (struct ext4_super_block *) data; es->s_block_group_nr = cpu_to_le16(group); - if (ext4_has_metadata_csum(sb)) + if (ext4_has_feature_metadata_csum(sb)) es->s_checksum = ext4_superblock_csum(sb, es); } @@ -1315,7 +1315,7 @@ static int ext4_set_bitmap_checksums(struct super_block *sb, { struct buffer_head *bh; - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return 0; bh = ext4_get_bitmap(sb, group_data->inode_bitmap); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a50e5c31b937..8122d4ffb3b5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -79,7 +79,6 @@ static int ext4_unfreeze(struct super_block *sb); static int ext4_freeze(struct super_block *sb); static inline int ext2_feature_set_ok(struct super_block *sb); static inline int ext3_feature_set_ok(struct super_block *sb); -static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); static struct inode *ext4_get_journal_inode(struct super_block *sb, @@ -302,7 +301,7 @@ __le32 ext4_superblock_csum(struct super_block *sb, static int ext4_superblock_csum_verify(struct super_block *sb, struct ext4_super_block *es) { - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return 1; return es->s_checksum == ext4_superblock_csum(sb, es); @@ -312,7 +311,7 @@ void ext4_superblock_csum_set(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return; es->s_checksum = ext4_superblock_csum(sb, es); @@ -448,9 +447,6 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) #define ext4_get_tstamp(es, tstamp) \ __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) -#define EXT4_SB_REFRESH_INTERVAL_SEC (3600) /* seconds (1 hour) */ -#define EXT4_SB_REFRESH_INTERVAL_KB (16384) /* kilobytes (16MB) */ - /* * The ext4_maybe_update_superblock() function checks and updates the * superblock if needed. @@ -458,8 +454,10 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) * This function is designed to update the on-disk superblock only under * certain conditions to prevent excessive disk writes and unnecessary * waking of the disk from sleep. The superblock will be updated if: - * 1. More than an hour has passed since the last superblock update, and - * 2. More than 16MB have been written since the last superblock update. + * 1. More than sbi->s_sb_update_sec (def: 1 hour) has passed since the last + * superblock update + * 2. More than sbi->s_sb_update_kb (def: 16MB) kbs have been written since the + * last superblock update. 
* * @sb: The superblock */ @@ -473,14 +471,15 @@ static void ext4_maybe_update_superblock(struct super_block *sb) __u64 lifetime_write_kbytes; __u64 diff_size; - if (sb_rdonly(sb) || !(sb->s_flags & SB_ACTIVE) || - !journal || (journal->j_flags & JBD2_UNMOUNT)) + if (ext4_emergency_state(sb) || sb_rdonly(sb) || + !(sb->s_flags & SB_ACTIVE) || !journal || + journal->j_flags & JBD2_UNMOUNT) return; now = ktime_get_real_seconds(); last_update = ext4_get_tstamp(es, s_wtime); - if (likely(now - last_update < EXT4_SB_REFRESH_INTERVAL_SEC)) + if (likely(now - last_update < sbi->s_sb_update_sec)) return; lifetime_write_kbytes = sbi->s_kbytes_written + @@ -495,32 +494,18 @@ static void ext4_maybe_update_superblock(struct super_block *sb) */ diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written); - if (diff_size > EXT4_SB_REFRESH_INTERVAL_KB) + if (diff_size > sbi->s_sb_update_kb) schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) { struct super_block *sb = journal->j_private; - struct ext4_sb_info *sbi = EXT4_SB(sb); - int error = is_journal_aborted(journal); - struct ext4_journal_cb_entry *jce; BUG_ON(txn->t_state == T_FINISHED); ext4_process_freed_data(sb, txn->t_tid); ext4_maybe_update_superblock(sb); - - spin_lock(&sbi->s_md_lock); - while (!list_empty(&txn->t_private_list)) { - jce = list_entry(txn->t_private_list.next, - struct ext4_journal_cb_entry, jce_list); - list_del_init(&jce->jce_list); - spin_unlock(&sbi->s_md_lock); - jce->jce_func(sb, jce, error); - spin_lock(&sbi->s_md_lock); - } - spin_unlock(&sbi->s_md_lock); } /* @@ -707,11 +692,8 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); - if (!continue_fs && !sb_rdonly(sb)) { - set_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); - if (journal) - jbd2_journal_abort(journal, -EIO); - } + if (!continue_fs && !ext4_emergency_ro(sb) && journal) + jbd2_journal_abort(journal, -EIO); if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, error, ino, block, func, line); @@ -719,9 +701,13 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, * In case the fs should keep running, we need to writeout * superblock through the journal. Due to lock ordering * constraints, it may not be safe to do it right here so we - * defer superblock flushing to a workqueue. + * defer superblock flushing to a workqueue. We just need to be + * careful when the journal is already shutting down. If we get + * here in that case, just update the sb directly as the last + * transaction won't commit anyway. */ - if (continue_fs && journal) + if (continue_fs && journal && + !ext4_test_mount_flag(sb, EXT4_MF_JOURNAL_DESTROY)) schedule_work(&EXT4_SB(sb)->s_sb_upd_work); else ext4_commit_super(sb); @@ -737,17 +723,17 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, sb->s_id); } - if (sb_rdonly(sb) || continue_fs) + if (ext4_emergency_ro(sb) || continue_fs) return; ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); /* - * EXT4_FLAGS_SHUTDOWN was set which stops all filesystem - * modifications. We don't set SB_RDONLY because that requires - * sb->s_umount semaphore and setting it without proper remount - * procedure is confusing code such as freeze_super() leading to - * deadlocks and other problems. 
+ * We don't set SB_RDONLY because that requires sb->s_umount + * semaphore and setting it without proper remount procedure is + * confusing code such as freeze_super() leading to deadlocks + * and other problems. */ + set_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags); } static void update_super_work(struct work_struct *work) @@ -765,7 +751,8 @@ static void update_super_work(struct work_struct *work) * We use directly jbd2 functions here to avoid recursing back into * ext4 error handling code during handling of previous errors. */ - if (!sb_rdonly(sbi->s_sb) && journal) { + if (!ext4_emergency_state(sbi->s_sb) && + !sb_rdonly(sbi->s_sb) && journal) { struct buffer_head *sbh = sbi->s_sbh; bool call_notify_err = false; @@ -819,7 +806,7 @@ void __ext4_error(struct super_block *sb, const char *function, struct va_format vaf; va_list args; - if (unlikely(ext4_forced_shutdown(sb))) + if (unlikely(ext4_emergency_state(sb))) return; trace_ext4_error(sb, function, line); @@ -844,7 +831,7 @@ void __ext4_error_inode(struct inode *inode, const char *function, va_list args; struct va_format vaf; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) + if (unlikely(ext4_emergency_state(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); @@ -879,7 +866,7 @@ void __ext4_error_file(struct file *file, const char *function, struct inode *inode = file_inode(file); char pathname[80], *path; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) + if (unlikely(ext4_emergency_state(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); @@ -959,7 +946,7 @@ void __ext4_std_error(struct super_block *sb, const char *function, char nbuf[16]; const char *errstr; - if (unlikely(ext4_forced_shutdown(sb))) + if (unlikely(ext4_emergency_state(sb))) return; /* Special case: if the error is EROFS, and we're not already @@ -1053,7 +1040,7 @@ __acquires(bitlock) struct va_format vaf; va_list args; - if (unlikely(ext4_forced_shutdown(sb))) + if (unlikely(ext4_emergency_state(sb))) return; trace_ext4_error(sb, function, line); @@ -1306,18 +1293,17 @@ static void ext4_put_super(struct super_block *sb) ext4_unregister_li_request(sb); ext4_quotas_off(sb, EXT4_MAXQUOTAS); - flush_work(&sbi->s_sb_upd_work); destroy_workqueue(sbi->rsv_conversion_wq); ext4_release_orphan_info(sb); if (sbi->s_journal) { aborted = is_journal_aborted(sbi->s_journal); - err = jbd2_journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; + err = ext4_journal_destroy(sbi, sbi->s_journal); if ((err < 0) && !aborted) { ext4_abort(sb, -err, "Couldn't clean up the journal"); } - } + } else + flush_work(&sbi->s_sb_upd_work); ext4_es_unregister_shrinker(sbi); timer_shutdown_sync(&sbi->s_err_report); @@ -1325,13 +1311,14 @@ static void ext4_put_super(struct super_block *sb) ext4_mb_release(sb); ext4_ext_release(sb); - if (!sb_rdonly(sb) && !aborted) { - ext4_clear_feature_journal_needs_recovery(sb); - ext4_clear_feature_orphan_present(sb); - es->s_state = cpu_to_le16(sbi->s_mount_state); - } - if (!sb_rdonly(sb)) + if (!ext4_emergency_state(sb) && !sb_rdonly(sb)) { + if (!aborted) { + ext4_clear_feature_journal_needs_recovery(sb); + ext4_clear_feature_orphan_present(sb); + es->s_state = cpu_to_le16(sbi->s_mount_state); + } ext4_commit_super(sb); + } ext4_group_desc_free(sbi); ext4_flex_groups_free(sbi); @@ -1426,7 +1413,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) spin_lock_init(&ei->i_completed_io_lock); ei->i_sync_tid = 0; ei->i_datasync_tid = 0; - atomic_set(&ei->i_unwritten, 0); 
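The ext4_forced_shutdown() checks being replaced throughout these hunks now call a new ext4_emergency_state() helper, and the error path above sets EXT4_FLAGS_EMERGENCY_RO instead of flipping SB_RDONLY. Neither helper body appears in the quoted hunks; a plausible sketch, inferred only from the call sites (callers treat the return value as 0 or a -errno, and -EROFS surfaces on the remount path):

	/* Sketch only -- not quoted from the patch. */
	static inline bool ext4_emergency_ro(struct super_block *sb)
	{
		return test_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags);
	}

	static inline int ext4_emergency_state(struct super_block *sb)
	{
		if (unlikely(ext4_forced_shutdown(sb)))
			return -EIO;	/* shutdown takes precedence over emergency-ro */
		if (unlikely(ext4_emergency_ro(sb)))
			return -EROFS;
		return 0;
	}

Similarly, the open-coded flush_work() + jbd2_journal_destroy() + "sbi->s_journal = NULL" sequences, replaced here and in the later journal teardown paths, are folded into ext4_journal_destroy(); judging by the removed lines and the EXT4_MF_JOURNAL_DESTROY test added in ext4_handle_error(), it presumably amounts to roughly:

	/* Same caveat: shape inferred from the call sites, not the patch itself. */
	static inline int ext4_journal_destroy(struct ext4_sb_info *sbi,
					       journal_t *journal)
	{
		int err;

		/* Stop error paths from queueing further superblock updates, */
		ext4_set_mount_flag(sbi->s_sb, EXT4_MF_JOURNAL_DESTROY);
		/* then drain any update already queued before killing the journal. */
		flush_work(&sbi->s_sb_upd_work);
		err = jbd2_journal_destroy(journal);
		sbi->s_journal = NULL;
		return err;
	}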
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); ext4_fc_init_inode(&ei->vfs_inode); mutex_init(&ei->i_fc_lock); @@ -2785,6 +2771,13 @@ static int ext4_check_opt_consistency(struct fs_context *fc, } if (is_remount) { + if (!sbi->s_journal && + ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT)) { + ext4_msg(NULL, KERN_WARNING, + "Remounting fs w/o journal so ignoring data_err option"); + ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT); + } + if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) && (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { ext4_msg(NULL, KERN_ERR, "can't mount with " @@ -3038,6 +3031,12 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, if (nodefs && !test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) SEQ_OPTS_PUTS("prefetch_block_bitmaps"); + if (ext4_emergency_ro(sb)) + SEQ_OPTS_PUTS("emergency_ro"); + + if (ext4_forced_shutdown(sb)) + SEQ_OPTS_PUTS("shutdown"); + ext4_show_quota_options(seq, sb); return 0; } @@ -3205,7 +3204,7 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, __le32 le_group = cpu_to_le32(block_group); struct ext4_sb_info *sbi = EXT4_SB(sb); - if (ext4_has_metadata_csum(sbi->s_sb)) { + if (ext4_has_feature_metadata_csum(sbi->s_sb)) { /* Use new metadata_csum algorithm */ __u32 csum32; __u16 dummy_csum = 0; @@ -3693,7 +3692,8 @@ static int ext4_run_li_request(struct ext4_li_request *elr) if (group >= elr->lr_next_group) { ret = 1; if (elr->lr_first_not_zeroed != ngroups && - !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) { + !ext4_emergency_state(sb) && !sb_rdonly(sb) && + test_opt(sb, INIT_INODE_TABLE)) { elr->lr_next_group = elr->lr_first_not_zeroed; elr->lr_mode = EXT4_LI_MODE_ITABLE; ret = 0; @@ -3998,7 +3998,7 @@ int ext4_register_li_request(struct super_block *sb, goto out; } - if (sb_rdonly(sb) || + if (ext4_emergency_state(sb) || sb_rdonly(sb) || (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) && (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE)))) goto out; @@ -4061,7 +4061,7 @@ static int set_journal_csum_feature_set(struct super_block *sb) int compat, incompat; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (ext4_has_metadata_csum(sb)) { + if (ext4_has_feature_metadata_csum(sb)) { /* journal checksum v3 */ compat = 0; incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; @@ -4349,7 +4349,7 @@ static void ext4_set_def_opts(struct super_block *sb, if (ext4_has_feature_fast_commit(sb)) set_opt2(sb, JOURNAL_FAST_COMMIT); /* don't forget to enable journal_csum when metadata_csum is enabled. */ - if (ext4_has_metadata_csum(sb)) + if (ext4_has_feature_metadata_csum(sb)) set_opt(sb, JOURNAL_CHECKSUM); if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) @@ -4642,7 +4642,8 @@ static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_blo /* Precompute checksum seed for all metadata */ if (ext4_has_feature_csum_seed(sb)) sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); - else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) + else if (ext4_has_feature_metadata_csum(sb) || + ext4_has_feature_ea_inode(sb)) sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, sizeof(es->s_uuid)); return 0; @@ -4973,10 +4974,7 @@ static int ext4_load_and_init_journal(struct super_block *sb, return 0; out: - /* flush s_sb_upd_work before destroying the journal. 
*/ - flush_work(&sbi->s_sb_upd_work); - jbd2_journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; + ext4_journal_destroy(sbi, sbi->s_journal); return -EINVAL; } @@ -5013,6 +5011,24 @@ static int ext4_check_journal_data_mode(struct super_block *sb) return 0; } +static const char *ext4_has_journal_option(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) + return "journal_async_commit"; + if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) + return "journal_checksum"; + if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) + return "commit="; + if (EXT4_MOUNT_DATA_FLAGS & + (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) + return "data="; + if (test_opt(sb, DATA_ERR_ABORT)) + return "data_err=abort"; + return NULL; +} + static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb, int silent) { @@ -5263,6 +5279,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; + sbi->s_sb_update_kb = EXT4_DEF_SB_UPDATE_INTERVAL_KB; + sbi->s_sb_update_sec = EXT4_DEF_SB_UPDATE_INTERVAL_SEC; /* * set default s_li_wait_mult for lazyinit, for the case there is @@ -5404,30 +5422,17 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) "suppressed and not mounted read-only"); goto failed_mount3a; } else { + const char *journal_option; + /* Nojournal mode, all journal mount options are illegal */ - if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "journal_async_commit, fs mounted w/o journal"); + journal_option = ext4_has_journal_option(sb); + if (journal_option != NULL) { + ext4_msg(sb, KERN_ERR, + "can't mount with %s, fs mounted w/o journal", + journal_option); goto failed_mount3a; } - if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "journal_checksum, fs mounted w/o journal"); - goto failed_mount3a; - } - if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "commit=%lu, fs mounted w/o journal", - sbi->s_commit_interval / HZ); - goto failed_mount3a; - } - if (EXT4_MOUNT_DATA_FLAGS & - (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "data=, fs mounted w/o journal"); - goto failed_mount3a; - } sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM; clear_opt(sb, JOURNAL_CHECKSUM); clear_opt(sb, DATA_FLAGS); @@ -5616,9 +5621,11 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) goto failed_mount9; } - if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) + if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) { ext4_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but the device does not support discard"); + clear_opt(sb, DISCARD); + } if (es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ @@ -5665,10 +5672,7 @@ failed_mount_wq: sbi->s_ea_block_cache = NULL; if (sbi->s_journal) { - /* flush s_sb_upd_work before journal destroy. 
*/ - flush_work(&sbi->s_sb_upd_work); - jbd2_journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; + ext4_journal_destroy(sbi, sbi->s_journal); } failed_mount3a: ext4_es_unregister_shrinker(sbi); @@ -5773,10 +5777,6 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_flags |= JBD2_BARRIER; else journal->j_flags &= ~JBD2_BARRIER; - if (test_opt(sb, DATA_ERR_ABORT)) - journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; - else - journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; /* * Always enable journal cycle record option, letting the journal * records log transactions continuously between each mount. @@ -5973,7 +5973,7 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb, return journal; out_journal: - jbd2_journal_destroy(journal); + ext4_journal_destroy(EXT4_SB(sb), journal); out_bdev: bdev_fput(bdev_file); return ERR_PTR(errno); @@ -6090,8 +6090,7 @@ static int ext4_load_journal(struct super_block *sb, EXT4_SB(sb)->s_journal = journal; err = ext4_clear_journal_err(sb, es); if (err) { - EXT4_SB(sb)->s_journal = NULL; - jbd2_journal_destroy(journal); + ext4_journal_destroy(EXT4_SB(sb), journal); return err; } @@ -6109,7 +6108,7 @@ static int ext4_load_journal(struct super_block *sb, return 0; err_out: - jbd2_journal_destroy(journal); + ext4_journal_destroy(EXT4_SB(sb), journal); return err; } @@ -6336,8 +6335,9 @@ static int ext4_sync_fs(struct super_block *sb, int wait) bool needs_barrier = false; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (unlikely(ext4_forced_shutdown(sb))) - return -EIO; + ret = ext4_emergency_state(sb); + if (unlikely(ret)) + return ret; trace_ext4_sync_fs(sb, wait); flush_workqueue(sbi->rsv_conversion_wq); @@ -6419,7 +6419,7 @@ out: */ static int ext4_unfreeze(struct super_block *sb) { - if (ext4_forced_shutdown(sb)) + if (ext4_emergency_state(sb)) return 0; if (EXT4_SB(sb)->s_journal) { @@ -6575,7 +6575,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) flush_work(&sbi->s_sb_upd_work); if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) { - if (ext4_forced_shutdown(sb)) { + if (ext4_emergency_state(sb)) { err = -EROFS; goto restore_opts; } @@ -6780,6 +6780,7 @@ static int ext4_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; int ret; + bool old_ro = sb_rdonly(sb); fc->s_fs_info = EXT4_SB(sb); @@ -6791,9 +6792,9 @@ static int ext4_reconfigure(struct fs_context *fc) if (ret < 0) return ret; - ext4_msg(sb, KERN_INFO, "re-mounted %pU %s. Quota mode: %s.", - &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w", - ext4_quota_mode(sb)); + ext4_msg(sb, KERN_INFO, "re-mounted %pU%s.", + &sb->s_uuid, + (old_ro != sb_rdonly(sb)) ? (sb_rdonly(sb) ? " ro" : " r/w") : ""); return 0; } @@ -6817,22 +6818,29 @@ static int ext4_statfs_project(struct super_block *sb, dquot->dq_dqb.dqb_bhardlimit); limit >>= sb->s_blocksize_bits; - if (limit && buf->f_blocks > limit) { + if (limit) { + uint64_t remaining = 0; + curblock = (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; - buf->f_blocks = limit; - buf->f_bfree = buf->f_bavail = - (buf->f_blocks > curblock) ? 
- (buf->f_blocks - curblock) : 0; + if (limit > curblock) + remaining = limit - curblock; + + buf->f_blocks = min(buf->f_blocks, limit); + buf->f_bfree = min(buf->f_bfree, remaining); + buf->f_bavail = min(buf->f_bavail, remaining); } limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit, dquot->dq_dqb.dqb_ihardlimit); - if (limit && buf->f_files > limit) { - buf->f_files = limit; - buf->f_ffree = - (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? - (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; + if (limit) { + uint64_t remaining = 0; + + if (limit > dquot->dq_dqb.dqb_curinodes) + remaining = limit - dquot->dq_dqb.dqb_curinodes; + + buf->f_files = min(buf->f_files, limit); + buf->f_ffree = min(buf->f_ffree, remaining); } spin_unlock(&dquot->dq_dqb_lock); @@ -6935,12 +6943,25 @@ static int ext4_release_dquot(struct dquot *dquot) { int ret, err; handle_t *handle; + bool freeze_protected = false; + + /* + * Trying to sb_start_intwrite() in a running transaction + * can result in a deadlock. Further, running transactions + * are already protected from freezing. + */ + if (!ext4_journal_current_handle()) { + sb_start_intwrite(dquot->dq_sb); + freeze_protected = true; + } handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) { /* Release dquot anyway to avoid endless cycle in dqput() */ dquot_release(dquot); + if (freeze_protected) + sb_end_intwrite(dquot->dq_sb); return PTR_ERR(handle); } ret = dquot_release(dquot); @@ -6951,6 +6972,10 @@ static int ext4_release_dquot(struct dquot *dquot) err = ext4_journal_stop(handle); if (!ret) ret = err; + + if (freeze_protected) + sb_end_intwrite(dquot->dq_sb); + return ret; } @@ -7288,7 +7313,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, } lock_buffer(bh); memcpy(bh->b_data+offset, data, len); - flush_dcache_page(bh->b_page); + flush_dcache_folio(bh->b_folio); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); @@ -7381,12 +7406,9 @@ static struct file_system_type ext4_fs_type = { }; MODULE_ALIAS_FS("ext4"); -/* Shared across all ext4 file systems */ -wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; - static int __init ext4_init_fs(void) { - int i, err; + int err; ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64); ext4_li_info = NULL; @@ -7394,9 +7416,6 @@ static int __init ext4_init_fs(void) /* Build-time check for flags consistency */ ext4_check_flag_values(); - for (i = 0; i < EXT4_WQ_HASH_SZ; i++) - init_waitqueue_head(&ext4__ioend_wq[i]); - err = ext4_init_es(); if (err) return err; diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index ddb54608ca2e..987bd00f916a 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -254,6 +254,8 @@ EXT4_ATTR(journal_task, 0444, journal_task); EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch); EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit); EXT4_RW_ATTR_SBI_UL(last_trim_minblks, s_last_trim_minblks); +EXT4_RW_ATTR_SBI_UI(sb_update_sec, s_sb_update_sec); +EXT4_RW_ATTR_SBI_UI(sb_update_kb, s_sb_update_kb); static unsigned int old_bump_val = 128; EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val); @@ -305,6 +307,8 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(mb_prefetch), ATTR_LIST(mb_prefetch_limit), ATTR_LIST(last_trim_minblks), + ATTR_LIST(sb_update_sec), + ATTR_LIST(sb_update_kb), NULL, }; ATTRIBUTE_GROUPS(ext4); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 7647e9f6e190..7ab8f2e8e815 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ 
-156,7 +156,7 @@ static int ext4_xattr_block_csum_verify(struct inode *inode, struct ext4_xattr_header *hdr = BHDR(bh); int ret = 1; - if (ext4_has_metadata_csum(inode->i_sb)) { + if (ext4_has_feature_metadata_csum(inode->i_sb)) { lock_buffer(bh); ret = (hdr->h_checksum == ext4_xattr_block_csum(inode, bh->b_blocknr, hdr)); @@ -168,7 +168,7 @@ static int ext4_xattr_block_csum_verify(struct inode *inode, static void ext4_xattr_block_csum_set(struct inode *inode, struct buffer_head *bh) { - if (ext4_has_metadata_csum(inode->i_sb)) + if (ext4_has_feature_metadata_csum(inode->i_sb)) BHDR(bh)->h_checksum = ext4_xattr_block_csum(inode, bh->b_blocknr, BHDR(bh)); } @@ -308,7 +308,7 @@ __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh, __ext4_xattr_check_block((inode), (bh), __func__, __LINE__) -static inline int +int __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, void *end, const char *function, unsigned int line) { @@ -316,9 +316,6 @@ __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, function, line); } -#define xattr_check_inode(inode, header, end) \ - __xattr_check_inode((inode), (header), (end), __func__, __LINE__) - static int xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry, void *end, int name_index, const char *name, int sorted) @@ -649,10 +646,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, return error; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); - end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - error = xattr_check_inode(inode, header, end); - if (error) - goto cleanup; + end = ITAIL(inode, raw_inode); entry = IFIRST(header); error = xattr_find_entry(inode, &entry, end, name_index, name, 0); if (error) @@ -783,7 +777,6 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; struct ext4_iloc iloc; - void *end; int error; if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) @@ -793,14 +786,9 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) return error; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); - end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - error = xattr_check_inode(inode, header, end); - if (error) - goto cleanup; error = ext4_xattr_list_entries(dentry, IFIRST(header), buffer, buffer_size); -cleanup: brelse(iloc.bh); return error; } @@ -868,7 +856,6 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) struct ext4_xattr_ibody_header *header; struct ext4_xattr_entry *entry; qsize_t ea_inode_refs = 0; - void *end; int ret; lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem); @@ -879,10 +866,6 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) goto out; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); - end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - ret = xattr_check_inode(inode, header, end); - if (ret) - goto out; for (entry = IFIRST(header); !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) @@ -1176,15 +1159,24 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, { struct inode *ea_inode; struct ext4_xattr_entry *entry; + struct ext4_iloc iloc; bool dirty = false; unsigned int ea_ino; int err; int credits; + void *end; + + if (block_csum) + end = (void *)bh->b_data + bh->b_size; + else { + ext4_get_inode_loc(parent, &iloc); + end = (void *)ext4_raw_inode(&iloc) 
+ EXT4_SB(parent->i_sb)->s_inode_size; + } /* One credit for dec ref on ea_inode, one for orphan list addition, */ credits = 2 + extra_credits; - for (entry = first; !IS_LAST_ENTRY(entry); + for (entry = first; (void *)entry < end && !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; @@ -2235,11 +2227,8 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, header = IHDR(inode, raw_inode); is->s.base = is->s.first = IFIRST(header); is->s.here = is->s.first; - is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + is->s.end = ITAIL(inode, raw_inode); if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { - error = xattr_check_inode(inode, header, is->s.end); - if (error) - return error; /* Find the named attribute. */ error = xattr_find_entry(inode, &is->s.here, is->s.end, i->name_index, i->name, 0); @@ -2786,14 +2775,10 @@ retry: */ base = IFIRST(header); - end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + end = ITAIL(inode, raw_inode); min_offs = end - base; total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32); - error = xattr_check_inode(inode, header, end); - if (error) - goto cleanup; - ifree = ext4_xattr_free_space(base, &min_offs, base, &total_ino); if (ifree >= isize_diff) goto shift; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index b25c2d7b5f99..1fedf44d4fb6 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -67,6 +67,9 @@ struct ext4_xattr_entry { ((void *)raw_inode + \ EXT4_GOOD_OLD_INODE_SIZE + \ EXT4_I(inode)->i_extra_isize)) +#define ITAIL(inode, raw_inode) \ + ((void *)(raw_inode) + \ + EXT4_SB((inode)->i_sb)->s_inode_size) #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) /* @@ -206,6 +209,13 @@ extern int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, extern struct mb_cache *ext4_xattr_create_cache(void); extern void ext4_xattr_destroy_cache(struct mb_cache *); +extern int +__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, + void *end, const char *function, unsigned int line); + +#define xattr_check_inode(inode, header, end) \ + __xattr_check_inode((inode), (header), (end), __func__, __LINE__) + #ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, const struct qstr *qstr); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index efda9a022981..cf77987d0698 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -21,7 +21,7 @@ #include "iostat.h" #include <trace/events/f2fs.h> -#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) +#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 3)) static struct kmem_cache *ino_entry_slab; struct kmem_cache *f2fs_inode_entry_slab; @@ -58,7 +58,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, bool is_meta) { struct address_space *mapping = META_MAPPING(sbi); - struct page *page; + struct folio *folio; struct f2fs_io_info fio = { .sbi = sbi, .type = META, @@ -74,37 +74,37 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, if (unlikely(!is_meta)) fio.op_flags &= ~REQ_META; repeat: - page = f2fs_grab_cache_page(mapping, index, false); - if (!page) { + folio = f2fs_grab_cache_folio(mapping, index, false); + if (IS_ERR(folio)) { cond_resched(); goto repeat; } - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) goto out; - fio.page = page; + fio.page = &folio->page; err = f2fs_submit_page_bio(&fio); 
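One note before the error handling continues below: the folio conversion here is internal to __get_meta_page(), which still returns &folio->page, so the external contract of f2fs_get_meta_page() is unchanged. A hypothetical caller (not from the patch) keeps using the page-based idiom:

	/* Illustrative caller only, to show the unchanged API. */
	static int read_one_meta_block(struct f2fs_sb_info *sbi, pgoff_t index)
	{
		struct page *page = f2fs_get_meta_page(sbi, index);

		if (IS_ERR(page))
			return PTR_ERR(page);
		/* page is locked and uptodate here; use page_address(page)... */
		f2fs_put_page(page, 1);	/* unlock and drop the reference */
		return 0;
	}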
if (err) { - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); return ERR_PTR(err); } f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, F2FS_BLKSIZE); - lock_page(page); - if (unlikely(page->mapping != mapping)) { - f2fs_put_page(page, 1); + folio_lock(folio); + if (unlikely(folio->mapping != mapping)) { + f2fs_folio_put(folio, true); goto repeat; } - if (unlikely(!PageUptodate(page))) { - f2fs_handle_page_eio(sbi, page_folio(page), META); - f2fs_put_page(page, 1); + if (unlikely(!folio_test_uptodate(folio))) { + f2fs_handle_page_eio(sbi, folio, META); + f2fs_folio_put(folio, true); return ERR_PTR(-EIO); } out: - return page; + return &folio->page; } struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) @@ -381,12 +381,6 @@ redirty_out: return AOP_WRITEPAGE_ACTIVATE; } -static int f2fs_write_meta_page(struct page *page, - struct writeback_control *wbc) -{ - return __f2fs_write_meta_page(page, wbc, FS_META_IO); -} - static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -507,7 +501,6 @@ static bool f2fs_dirty_meta_folio(struct address_space *mapping, } const struct address_space_operations f2fs_meta_aops = { - .writepage = f2fs_write_meta_page, .writepages = f2fs_write_meta_pages, .dirty_folio = f2fs_dirty_meta_folio, .invalidate_folio = f2fs_invalidate_folio, @@ -1237,7 +1230,7 @@ static int block_operations(struct f2fs_sb_info *sbi) retry_flush_quotas: f2fs_lock_all(sbi); if (__need_flush_quota(sbi)) { - int locked; + bool need_lock = sbi->umount_lock_holder != current; if (++cnt > DEFAULT_RETRY_QUOTA_FLUSH_COUNT) { set_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH); @@ -1246,11 +1239,13 @@ retry_flush_quotas: } f2fs_unlock_all(sbi); - /* only failed during mount/umount/freeze/quotactl */ - locked = down_read_trylock(&sbi->sb->s_umount); - f2fs_quota_sync(sbi->sb, -1); - if (locked) + /* don't grab s_umount lock during mount/umount/remount/freeze/quotactl */ + if (!need_lock) { + f2fs_do_quota_sync(sbi->sb, -1); + } else if (down_read_trylock(&sbi->sb->s_umount)) { + f2fs_do_quota_sync(sbi->sb, -1); up_read(&sbi->sb->s_umount); + } cond_resched(); goto retry_flush_quotas; } @@ -1344,21 +1339,13 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long flags; - if (cpc->reason & CP_UMOUNT) { - if (le32_to_cpu(ckpt->cp_pack_total_block_count) + - NM_I(sbi)->nat_bits_blocks > BLKS_PER_SEG(sbi)) { - clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG); - f2fs_notice(sbi, "Disable nat_bits due to no space"); - } else if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) && - f2fs_nat_bitmap_enabled(sbi)) { - f2fs_enable_nat_bits(sbi); - set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); - f2fs_notice(sbi, "Rebuild and enable nat_bits"); - } - } - spin_lock_irqsave(&sbi->cp_lock, flags); + if ((cpc->reason & CP_UMOUNT) && + le32_to_cpu(ckpt->cp_pack_total_block_count) > + sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) + disable_nat_bits(sbi, false); + if (cpc->reason & CP_TRIMMED) __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); else @@ -1541,8 +1528,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk = __start_cp_next_addr(sbi); /* write nat bits */ - if ((cpc->reason & CP_UMOUNT) && - is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) { + if (enabled_nat_bits(sbi, cpc)) { __u64 cp_ver = cur_cp_version(ckpt); block_t blk; @@ -1867,7 +1853,8 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi) struct cp_control cpc; cpc.reason = __get_cp_reason(sbi); - if 
(!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) { + if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC || + sbi->umount_lock_holder == current) { int ret; f2fs_down_write(&sbi->gc_lock); diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 985690d81a82..9b94810675c1 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1150,6 +1150,7 @@ retry: f2fs_compress_ctx_add_page(cc, page_folio(page)); if (!PageUptodate(page)) { + f2fs_handle_page_eio(sbi, page_folio(page), DATA); release_and_retry: f2fs_put_rpages(cc); f2fs_unlock_rpages(cc, i + 1); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ece5208223c1..54f89f0ee69b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -319,8 +319,7 @@ static void f2fs_read_end_io(struct bio *bio) static void f2fs_write_end_io(struct bio *bio) { struct f2fs_sb_info *sbi; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + struct folio_iter fi; iostat_update_and_unbind_ctx(bio); sbi = bio->bi_private; @@ -328,34 +327,41 @@ static void f2fs_write_end_io(struct bio *bio) if (time_to_inject(sbi, FAULT_WRITE_IO)) bio->bi_status = BLK_STS_IOERR; - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; - enum count_type type = WB_DATA_TYPE(page, false); + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; + enum count_type type; + + if (fscrypt_is_bounce_folio(folio)) { + struct folio *io_folio = folio; - fscrypt_finalize_bounce_page(&page); + folio = fscrypt_pagecache_folio(io_folio); + fscrypt_free_bounce_page(&io_folio->page); + } #ifdef CONFIG_F2FS_FS_COMPRESSION - if (f2fs_is_compressed_page(page)) { - f2fs_compress_write_end_io(bio, page); + if (f2fs_is_compressed_page(&folio->page)) { + f2fs_compress_write_end_io(bio, &folio->page); continue; } #endif + type = WB_DATA_TYPE(&folio->page, false); + if (unlikely(bio->bi_status)) { - mapping_set_error(page->mapping, -EIO); + mapping_set_error(folio->mapping, -EIO); if (type == F2FS_WB_CP_DATA) f2fs_stop_checkpoint(sbi, true, STOP_CP_REASON_WRITE_FAIL); } - f2fs_bug_on(sbi, page->mapping == NODE_MAPPING(sbi) && - page_folio(page)->index != nid_of_node(page)); + f2fs_bug_on(sbi, folio->mapping == NODE_MAPPING(sbi) && + folio->index != nid_of_node(&folio->page)); dec_page_count(sbi, type); - if (f2fs_in_warm_node_list(sbi, page)) - f2fs_del_fsync_node_entry(sbi, page); - clear_page_private_gcing(page); - end_page_writeback(page); + if (f2fs_in_warm_node_list(sbi, folio)) + f2fs_del_fsync_node_entry(sbi, &folio->page); + clear_page_private_gcing(&folio->page); + folio_end_writeback(folio); } if (!get_pages(sbi, F2FS_WB_CP_DATA) && wq_has_sleeper(&sbi->cp_wait)) @@ -413,6 +419,7 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) { unsigned int temp_mask = GENMASK(NR_TEMP_TYPE - 1, 0); + struct folio *fio_folio = page_folio(fio->page); unsigned int fua_flag, meta_flag, io_flag; blk_opf_t op_flags = 0; @@ -438,6 +445,11 @@ static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) op_flags |= REQ_META; if (BIT(fio->temp) & fua_flag) op_flags |= REQ_FUA; + + if (fio->type == DATA && + F2FS_I(fio_folio->mapping->host)->ioprio_hint == F2FS_IOPRIO_WRITE) + op_flags |= REQ_PRIO; + return op_flags; } @@ -876,6 +888,7 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) struct bio *bio = *fio->bio; struct page *page = fio->encrypted_page ? 
fio->encrypted_page : fio->page; + struct folio *folio = page_folio(fio->page); if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) @@ -889,8 +902,8 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) alloc_new: if (!bio) { bio = __bio_alloc(fio, BIO_MAX_VECS); - f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host, - page_folio(fio->page)->index, fio, GFP_NOIO); + f2fs_set_bio_crypt_ctx(bio, folio->mapping->host, + folio->index, fio, GFP_NOIO); add_bio_entry(fio->sbi, bio, page, fio->temp); } else { @@ -899,8 +912,7 @@ alloc_new: } if (fio->io_wbc) - wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), - PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, folio, folio_size(folio)); inc_page_count(fio->sbi, WB_DATA_TYPE(page, false)); @@ -1041,8 +1053,6 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, bio = bio_alloc_bioset(bdev, bio_max_segs(nr_pages), REQ_OP_READ | op_flag, for_write ? GFP_NOIO : GFP_KERNEL, &f2fs_bioset); - if (!bio) - return ERR_PTR(-ENOMEM); bio->bi_iter.bi_sector = sector; f2fs_set_bio_crypt_ctx(bio, inode, first_idx, NULL, GFP_NOFS); bio->bi_end_io = f2fs_read_end_io; @@ -1193,18 +1203,17 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) return err; } -struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - blk_opf_t op_flags, bool for_write, - pgoff_t *next_pgofs) +struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index, + blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; - struct page *page; + struct folio *folio; int err; - page = f2fs_grab_cache_page(mapping, index, for_write); - if (!page) - return ERR_PTR(-ENOMEM); + folio = f2fs_grab_cache_folio(mapping, index, for_write); + if (IS_ERR(folio)) + return folio; if (f2fs_lookup_read_extent_cache_block(inode, index, &dn.data_blkaddr)) { @@ -1239,9 +1248,9 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, goto put_err; } got_it: - if (PageUptodate(page)) { - unlock_page(page); - return page; + if (folio_test_uptodate(folio)) { + folio_unlock(folio); + return folio; } /* @@ -1252,48 +1261,51 @@ got_it: * f2fs_init_inode_metadata. 
*/ if (dn.data_blkaddr == NEW_ADDR) { - zero_user_segment(page, 0, PAGE_SIZE); - if (!PageUptodate(page)) - SetPageUptodate(page); - unlock_page(page); - return page; + folio_zero_segment(folio, 0, folio_size(folio)); + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); + folio_unlock(folio); + return folio; } - err = f2fs_submit_page_read(inode, page_folio(page), dn.data_blkaddr, + err = f2fs_submit_page_read(inode, folio, dn.data_blkaddr, op_flags, for_write); if (err) goto put_err; - return page; + return folio; put_err: - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); return ERR_PTR(err); } -struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, +struct folio *f2fs_find_data_folio(struct inode *inode, pgoff_t index, pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; - struct page *page; + struct folio *folio; - page = find_get_page_flags(mapping, index, FGP_ACCESSED); - if (page && PageUptodate(page)) - return page; - f2fs_put_page(page, 0); + folio = __filemap_get_folio(mapping, index, FGP_ACCESSED, 0); + if (IS_ERR(folio)) + goto read; + if (folio_test_uptodate(folio)) + return folio; + f2fs_folio_put(folio, false); - page = f2fs_get_read_data_page(inode, index, 0, false, next_pgofs); - if (IS_ERR(page)) - return page; +read: + folio = f2fs_get_read_data_folio(inode, index, 0, false, next_pgofs); + if (IS_ERR(folio)) + return folio; - if (PageUptodate(page)) - return page; + if (folio_test_uptodate(folio)) + return folio; - wait_on_page_locked(page); - if (unlikely(!PageUptodate(page))) { - f2fs_put_page(page, 0); + folio_wait_locked(folio); + if (unlikely(!folio_test_uptodate(folio))) { + f2fs_folio_put(folio, false); return ERR_PTR(-EIO); } - return page; + return folio; } /* @@ -1301,23 +1313,23 @@ struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, * Because, the callers, functions in dir.c and GC, should be able to know * whether this page exists or not. 
*/ -struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, +struct folio *f2fs_get_lock_data_folio(struct inode *inode, pgoff_t index, bool for_write) { struct address_space *mapping = inode->i_mapping; - struct page *page; + struct folio *folio; - page = f2fs_get_read_data_page(inode, index, 0, for_write, NULL); - if (IS_ERR(page)) - return page; + folio = f2fs_get_read_data_folio(inode, index, 0, for_write, NULL); + if (IS_ERR(folio)) + return folio; /* wait for read completion */ - lock_page(page); - if (unlikely(page->mapping != mapping || !PageUptodate(page))) { - f2fs_put_page(page, 1); + folio_lock(folio); + if (unlikely(folio->mapping != mapping || !folio_test_uptodate(folio))) { + f2fs_folio_put(folio, true); return ERR_PTR(-EIO); } - return page; + return folio; } /* @@ -2178,6 +2190,12 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, int i; int ret = 0; + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; + from_dnode = false; + goto out_put_dnode; + } + f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc)); last_block_in_file = F2FS_BYTES_TO_BLK(f2fs_readpage_limit(inode) + @@ -2221,10 +2239,6 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (ret) goto out; - if (unlikely(f2fs_cp_error(sbi))) { - ret = -EIO; - goto out_put_dnode; - } f2fs_bug_on(sbi, dn.data_blkaddr != COMPRESS_ADDR); skip_reading_dnode: @@ -2921,29 +2935,6 @@ redirty_out: return err; } -static int f2fs_write_data_page(struct page *page, - struct writeback_control *wbc) -{ - struct folio *folio = page_folio(page); -#ifdef CONFIG_F2FS_FS_COMPRESSION - struct inode *inode = folio->mapping->host; - - if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) - goto out; - - if (f2fs_compressed_file(inode)) { - if (f2fs_is_compressed_cluster(inode, folio->index)) { - folio_redirty_for_writepage(wbc, folio); - return AOP_WRITEPAGE_ACTIVATE; - } - } -out: -#endif - - return f2fs_write_single_data_page(folio, NULL, NULL, NULL, - wbc, FS_DATA_IO, 0, true); -} - /* * This function was copied from write_cache_pages from mm/page-writeback.c. 
* The major change is making write step of cold data page separately from @@ -3266,10 +3257,6 @@ static int __f2fs_write_data_pages(struct address_space *mapping, int ret; bool locked = false; - /* deal with chardevs and other special file */ - if (!mapping->a_ops->writepage) - return 0; - /* skip writing if there is no dirty page in this inode */ if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE) return 0; @@ -3390,7 +3377,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, restart: /* check inline_data */ - ipage = f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto unlock_out; @@ -3453,7 +3440,7 @@ static int __find_data_block(struct inode *inode, pgoff_t index, struct page *ipage; int err = 0; - ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -3483,7 +3470,7 @@ static int __reserve_data_block(struct inode *inode, pgoff_t index, f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); - ipage = f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto unlock_out; @@ -4101,7 +4088,6 @@ static void f2fs_swap_deactivate(struct file *file) const struct address_space_operations f2fs_dblock_aops = { .read_folio = f2fs_read_data_folio, .readahead = f2fs_readahead, - .writepage = f2fs_write_data_page, .writepages = f2fs_write_data_pages, .write_begin = f2fs_write_begin, .write_end = f2fs_write_end, @@ -4195,7 +4181,13 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, map.m_next_pgofs = &next_pgofs; map.m_seg_type = f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode), inode->i_write_hint); - if (flags & IOMAP_WRITE) + + /* + * If the blocks being overwritten are already allocated, + * f2fs_map_lock and f2fs_balance_fs are not necessary. 
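The fast path this comment describes leans on f2fs_overwrite_io(), called just below but not defined in these hunks: the pre-existing helper walks the range with m_may_create disabled and reports whether every block is already allocated. Roughly, under that assumption:

	/* Sketch of the helper as assumed by the hunk below; illustrative only. */
	bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len)
	{
		struct f2fs_map_blocks map = {
			.m_may_create = false,
			.m_seg_type = NO_CHECK_TYPE,
		};
		pgoff_t last_lblk;
		int err;

		if (pos + len > i_size_read(inode))
			return false;	/* extending write, not a pure overwrite */

		map.m_lblk = F2FS_BYTES_TO_BLK(pos);
		last_lblk = F2FS_BLK_ALIGN(pos + len);

		/* Every block in [pos, pos + len) must already be mapped. */
		while (map.m_lblk < last_lblk) {
			map.m_len = last_lblk - map.m_lblk;
			err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT);
			if (err || map.m_len == 0)
				return false;
			map.m_lblk += map.m_len;
		}
		return true;
	}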
+ */ + if ((flags & IOMAP_WRITE) && + !f2fs_overwrite_io(inode, offset, length)) map.m_may_create = true; err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DIO); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 468828288a4a..16c2dfb4f595 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -164,6 +164,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA); si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; + si->ndonate_files = sbi->donate_files; si->nquota_files = sbi->nquota_files; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->aw_cnt = atomic_read(&sbi->atomic_files); @@ -501,6 +502,8 @@ static int stat_show(struct seq_file *s, void *v) si->compr_inode, si->compr_blocks); seq_printf(s, " - Swapfile Inode: %u\n", si->swapfile_inode); + seq_printf(s, " - Donate Inode: %u\n", + si->ndonate_files); seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", si->orphans, si->append, si->update); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 54dd52de7269..5a63ff0df03b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -551,7 +551,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, goto put_error; } } else { - page = f2fs_get_node_page(F2FS_I_SB(dir), inode->i_ino); + page = f2fs_get_inode_page(F2FS_I_SB(dir), inode->i_ino); if (IS_ERR(page)) return page; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1afa7be16e7d..f1576dc6ec67 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -62,6 +62,7 @@ enum { FAULT_BLKADDR_VALIDITY, FAULT_BLKADDR_CONSISTENCE, FAULT_NO_SEGMENT, + FAULT_INCONSISTENT_FOOTER, FAULT_MAX, }; @@ -114,6 +115,13 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_GC_MERGE 0x02000000 #define F2FS_MOUNT_COMPRESS_CACHE 0x04000000 #define F2FS_MOUNT_AGE_EXTENT_CACHE 0x08000000 +#define F2FS_MOUNT_NAT_BITS 0x10000000 +#define F2FS_MOUNT_INLINECRYPT 0x20000000 +/* + * Some f2fs environments expect to be able to pass the "lazytime" option + * string rather than using the MS_LAZYTIME flag, so this must remain. 
+ */ +#define F2FS_MOUNT_LAZYTIME 0x40000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -830,6 +838,7 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags[BITS_TO_LONGS(FI_MAX)]; /* use to pass per-file flags */ + unsigned int ioprio_hint; /* hint for IO priority */ struct f2fs_rwsem i_sem; /* protect fi info */ atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ @@ -849,6 +858,11 @@ struct f2fs_inode_info { #endif struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ + + /* linked in global inode list for cache donation */ + struct list_head gdonate_list; + pgoff_t donate_start, donate_end; /* inclusive */ + struct task_struct *atomic_write_task; /* store atomic write task */ struct extent_tree *extent_tree[NR_EXTENT_CACHES]; /* cached extent_tree entry */ @@ -1273,6 +1287,7 @@ enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ DIRTY_META, /* for all dirtied inode metadata */ + DONATE_INODE, /* for all inode to donate pages */ NR_INODE_TYPE, }; @@ -1628,6 +1643,9 @@ struct f2fs_sb_info { unsigned int warm_data_age_threshold; unsigned int last_age_weight; + /* control donate caches */ + unsigned int donate_files; + /* basic filesystem units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ unsigned int log_blocksize; /* log2 block size */ @@ -1659,6 +1677,7 @@ struct f2fs_sb_info { unsigned int nquota_files; /* # of quota sysfile */ struct f2fs_rwsem quota_sem; /* blocking cp for flags */ + struct task_struct *umount_lock_holder; /* s_umount lock holder */ /* # of pages, see count_type */ atomic_t nr_pages[NR_COUNT_TYPE]; @@ -1800,6 +1819,9 @@ struct f2fs_sb_info { u64 committed_atomic_block; u64 revoked_atomic_block; + /* carve out reserved_blocks from total blocks */ + bool carve_out; + #ifdef CONFIG_F2FS_FS_COMPRESSION struct kmem_cache *page_array_slab; /* page array entry */ unsigned int page_array_slab_size; /* default page array slab size */ @@ -2015,7 +2037,7 @@ static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi) return (struct f2fs_checkpoint *)(sbi->ckpt); } -static inline struct f2fs_node *F2FS_NODE(struct page *page) +static inline struct f2fs_node *F2FS_NODE(const struct page *page) { return (struct f2fs_node *)page_address(page); } @@ -2219,6 +2241,36 @@ static inline void f2fs_up_write(struct f2fs_rwsem *sem) #endif } +static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) +{ + unsigned long flags; + unsigned char *nat_bits; + + /* + * In order to re-enable nat_bits we need to call fsck.f2fs by + * set_sbi_flag(sbi, SBI_NEED_FSCK). But it may give huge cost, + * so let's rely on regular fsck or unclean shutdown. + */ + + if (lock) + spin_lock_irqsave(&sbi->cp_lock, flags); + __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG); + nat_bits = NM_I(sbi)->nat_bits; + NM_I(sbi)->nat_bits = NULL; + if (lock) + spin_unlock_irqrestore(&sbi->cp_lock, flags); + + kvfree(nat_bits); +} + +static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, + struct cp_control *cpc) +{ + bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + + return (cpc) ? 
(cpc->reason & CP_UMOUNT) && set : set; +} + static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { f2fs_down_read(&sbi->cp_rwsem); @@ -2765,33 +2817,46 @@ static inline s64 valid_inode_count(struct f2fs_sb_info *sbi) return percpu_counter_sum_positive(&sbi->total_valid_inode_count); } -static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, - pgoff_t index, bool for_write) +static inline struct folio *f2fs_grab_cache_folio(struct address_space *mapping, + pgoff_t index, bool for_write) { - struct page *page; + struct folio *folio; unsigned int flags; if (IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)) { + fgf_t fgf_flags; + if (!for_write) - page = find_get_page_flags(mapping, index, - FGP_LOCK | FGP_ACCESSED); + fgf_flags = FGP_LOCK | FGP_ACCESSED; else - page = find_lock_page(mapping, index); - if (page) - return page; + fgf_flags = FGP_LOCK; + folio = __filemap_get_folio(mapping, index, fgf_flags, 0); + if (!IS_ERR(folio)) + return folio; if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) - return NULL; + return ERR_PTR(-ENOMEM); } if (!for_write) - return grab_cache_page(mapping, index); + return filemap_grab_folio(mapping, index); flags = memalloc_nofs_save(); - page = grab_cache_page_write_begin(mapping, index); + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); memalloc_nofs_restore(flags); - return page; + return folio; +} + +static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, + pgoff_t index, bool for_write) +{ + struct folio *folio = f2fs_grab_cache_folio(mapping, index, for_write); + + if (IS_ERR(folio)) + return NULL; + return &folio->page; } static inline struct page *f2fs_pagecache_get_page( @@ -2804,16 +2869,23 @@ static inline struct page *f2fs_pagecache_get_page( return pagecache_get_page(mapping, index, fgp_flags, gfp_mask); } -static inline void f2fs_put_page(struct page *page, int unlock) +static inline void f2fs_folio_put(struct folio *folio, bool unlock) { - if (!page) + if (!folio) return; if (unlock) { - f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page)); - unlock_page(page); + f2fs_bug_on(F2FS_F_SB(folio), !folio_test_locked(folio)); + folio_unlock(folio); } - put_page(page); + folio_put(folio); +} + +static inline void f2fs_put_page(struct page *page, int unlock) +{ + if (!page) + return; + f2fs_folio_put(page_folio(page), unlock); } static inline void f2fs_put_dnode(struct dnode_of_data *dn) @@ -3624,7 +3696,7 @@ int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); int f2fs_dquot_initialize(struct inode *inode); int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); -int f2fs_quota_sync(struct super_block *sb, int type); +int f2fs_do_quota_sync(struct super_block *sb, int type); loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct super_block *sb); void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag); @@ -3647,7 +3719,8 @@ struct node_info; int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type); -bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page); +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, + const struct folio *folio); void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi); void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page); void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi); @@ -3662,12 +3735,14 @@ int 
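
The f2fs_grab_cache_folio()/f2fs_grab_cache_page() pair above shows the transitional idiom used throughout this series: the folio API reports failure as an ERR_PTR-encoded pointer (note the explicit ERR_PTR(-ENOMEM) under fault injection), while legacy page-based callers still expect NULL. A user-space sketch of the two conventions and the adapter between them; ERR_PTR/IS_ERR are re-derived here and every type is a simplified stand-in:

    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>

    #define MAX_ERRNO 4095 /* mirrors the kernel's ERR_PTR window */

    static inline void *ERR_PTR(long err) { return (void *)err; }
    static inline int IS_ERR(const void *p)
    {
        return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;
    }

    struct folio { int dummy; };

    static struct folio *grab_cache_folio(int fail)
    {
        static struct folio f;

        return fail ? (struct folio *)ERR_PTR(-ENOMEM) : &f;
    }

    /* legacy wrapper: translate the ERR_PTR convention back to NULL */
    static struct folio *grab_cache_page(int fail)
    {
        struct folio *folio = grab_cache_folio(fail);

        return IS_ERR(folio) ? NULL : folio;
    }

    int main(void)
    {
        printf("ok=%p fail=%p\n", (void *)grab_cache_page(0),
               (void *)grab_cache_page(1));
        return 0;
    }
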
f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from); int f2fs_truncate_xattr_node(struct inode *inode); int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, unsigned int seq_id); -bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi); int f2fs_remove_inode_page(struct inode *inode); struct page *f2fs_new_inode_page(struct inode *inode); struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs); void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); +struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino); +struct page *f2fs_get_inode_page(struct f2fs_sb_info *sbi, pgoff_t ino); +struct page *f2fs_get_xnode_page(struct f2fs_sb_info *sbi, pgoff_t xnid); struct page *f2fs_get_node_page_ra(struct page *parent, int start); int f2fs_move_node_page(struct page *node_page, int gc_type); void f2fs_flush_inline_data(struct f2fs_sb_info *sbi); @@ -3687,7 +3762,6 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page); int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); -void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi); int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); int f2fs_build_node_manager(struct f2fs_sb_info *sbi); void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi); @@ -3758,8 +3832,10 @@ int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_io_info *fio); void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, block_t blkaddr, unsigned int blkcnt); -void f2fs_wait_on_page_writeback(struct page *page, - enum page_type type, bool ordered, bool locked); +void f2fs_folio_wait_writeback(struct folio *folio, enum page_type type, + bool ordered, bool locked); +#define f2fs_wait_on_page_writeback(page, type, ordered, locked) \ + f2fs_folio_wait_writeback(page_folio(page), type, ordered, locked) void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr); void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, block_t len); @@ -3871,11 +3947,11 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block_locked(struct dnode_of_data *dn, pgoff_t index); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); -struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs); -struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, - pgoff_t *next_pgofs); -struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, +struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index, + blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs); +struct folio *f2fs_find_data_folio(struct inode *inode, pgoff_t index, + pgoff_t *next_pgofs); +struct folio *f2fs_get_lock_data_folio(struct inode *inode, pgoff_t index, bool for_write); struct page *f2fs_get_new_data_page(struct inode *inode, struct page *ipage, pgoff_t index, bool new_i_size); @@ -3902,6 +3978,22 @@ int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi); void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi); extern const struct iomap_ops f2fs_iomap_ops; +static inline struct page *f2fs_find_data_page(struct inode *inode, + pgoff_t index, pgoff_t *next_pgofs) +{ + 
struct folio *folio = f2fs_find_data_folio(inode, index, next_pgofs); + + return &folio->page; +} + +static inline struct page *f2fs_get_lock_data_page(struct inode *inode, + pgoff_t index, bool for_write) +{ + struct folio *folio = f2fs_get_lock_data_folio(inode, index, for_write); + + return &folio->page; +} + /* * gc.c */ @@ -3966,7 +4058,8 @@ struct f2fs_stat_info { unsigned long long allocated_data_blocks; int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; int ndirty_data, ndirty_qdata; - unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; + unsigned int ndirty_dirs, ndirty_files, ndirty_all; + unsigned int nquota_files, ndonate_files; int nats, dirty_nats, sits, dirty_sits; int free_nids, avail_nids, alloc_nids; int total_count, utilization; @@ -4231,6 +4324,8 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink, struct shrink_control *sc); unsigned long f2fs_shrink_scan(struct shrinker *shrink, struct shrink_control *sc); +unsigned int f2fs_donate_files(void); +void f2fs_reclaim_caches(unsigned int reclaim_caches_kb); void f2fs_join_shrinker(struct f2fs_sb_info *sbi); void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f92a9fba9991..abbcbb5865a3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -707,31 +707,33 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, loff_t offset = from & (PAGE_SIZE - 1); pgoff_t index = from >> PAGE_SHIFT; struct address_space *mapping = inode->i_mapping; - struct page *page; + struct folio *folio; if (!offset && !cache_only) return 0; if (cache_only) { - page = find_lock_page(mapping, index); - if (page && PageUptodate(page)) + folio = filemap_lock_folio(mapping, index); + if (IS_ERR(folio)) + return 0; + if (folio_test_uptodate(folio)) goto truncate_out; - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); return 0; } - page = f2fs_get_lock_data_page(inode, index, true); - if (IS_ERR(page)) - return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page); + folio = f2fs_get_lock_data_folio(inode, index, true); + if (IS_ERR(folio)) + return PTR_ERR(folio) == -ENOENT ? 0 : PTR_ERR(folio); truncate_out: - f2fs_wait_on_page_writeback(page, DATA, true, true); - zero_user(page, offset, PAGE_SIZE - offset); + f2fs_folio_wait_writeback(folio, DATA, true, true); + folio_zero_segment(folio, offset, folio_size(folio)); /* An encrypted inode should have a key and truncate the last page. */ f2fs_bug_on(F2FS_I_SB(inode), cache_only && IS_ENCRYPTED(inode)); if (!cache_only) - set_page_dirty(page); - f2fs_put_page(page, 1); + folio_mark_dirty(folio); + f2fs_folio_put(folio, true); return 0; } @@ -759,7 +761,7 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) if (lock) f2fs_lock_op(sbi); - ipage = f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto out; @@ -1834,18 +1836,32 @@ static int f2fs_expand_inode_data(struct inode *inode, loff_t offset, map.m_len = sec_blks; next_alloc: + f2fs_down_write(&sbi->pin_sem); + + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (has_not_enough_free_secs(sbi, 0, 0)) { + f2fs_up_write(&sbi->pin_sem); + err = -ENOSPC; + f2fs_warn_ratelimited(sbi, + "ino:%lu, start:%lu, end:%lu, need to trigger GC to " + "reclaim enough free segment when checkpoint is enabled", + inode->i_ino, pg_start, pg_end); + goto out_err; + } + } + if (has_not_enough_free_secs(sbi, 0, f2fs_sb_has_blkzoned(sbi) ? 
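
Note that the page-returning wrappers above hand back &folio->page without an IS_ERR() check. That is presumably safe only because struct page is the first member of struct folio (a layout the kernel pins with static asserts), so an ERR_PTR-encoded folio pointer survives the conversion bit-for-bit and the caller's IS_ERR()/PTR_ERR() still work. A toy demonstration of that layout property, with both structs reduced to stand-ins:

    #include <assert.h>
    #include <stddef.h>

    struct page  { unsigned long flags; };
    struct folio { struct page page; /* ... */ };

    int main(void)
    {
        struct folio f;

        assert(offsetof(struct folio, page) == 0);
        /* &f.page aliases &f, so an error-encoded pointer round-trips */
        assert((void *)&f.page == (void *)&f);
        return 0;
    }
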
ZONED_PIN_SEC_REQUIRED_COUNT : GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { f2fs_down_write(&sbi->gc_lock); stat_inc_gc_call_count(sbi, FOREGROUND); err = f2fs_gc(sbi, &gc_control); - if (err && err != -ENODATA) + if (err && err != -ENODATA) { + f2fs_up_write(&sbi->pin_sem); goto out_err; + } } - f2fs_down_write(&sbi->pin_sem); - err = f2fs_allocate_pinning_section(sbi); if (err) { f2fs_up_write(&sbi->pin_sem); @@ -2448,6 +2464,52 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) return ret; } +static void f2fs_keep_noreuse_range(struct inode *inode, + loff_t offset, loff_t len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + u64 max_bytes = F2FS_BLK_TO_BYTES(max_file_blocks(inode)); + u64 start, end; + + if (!S_ISREG(inode->i_mode)) + return; + + if (offset >= max_bytes || len > max_bytes || + (offset + len) > max_bytes) + return; + + start = offset >> PAGE_SHIFT; + end = DIV_ROUND_UP(offset + len, PAGE_SIZE); + + inode_lock(inode); + if (f2fs_is_atomic_file(inode)) { + inode_unlock(inode); + return; + } + + spin_lock(&sbi->inode_lock[DONATE_INODE]); + /* let's remove the range, if len = 0 */ + if (!len) { + if (!list_empty(&F2FS_I(inode)->gdonate_list)) { + list_del_init(&F2FS_I(inode)->gdonate_list); + sbi->donate_files--; + } + } else { + if (list_empty(&F2FS_I(inode)->gdonate_list)) { + list_add_tail(&F2FS_I(inode)->gdonate_list, + &sbi->inode_list[DONATE_INODE]); + sbi->donate_files++; + } else { + list_move_tail(&F2FS_I(inode)->gdonate_list, + &sbi->inode_list[DONATE_INODE]); + } + F2FS_I(inode)->donate_start = start; + F2FS_I(inode)->donate_end = end - 1; + } + spin_unlock(&sbi->inode_lock[DONATE_INODE]); + inode_unlock(inode); +} + static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -3446,6 +3508,23 @@ static int f2fs_ioc_get_dev_alias_file(struct file *filp, unsigned long arg) (u32 __user *)arg); } +static int f2fs_ioc_io_prio(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + __u32 level; + + if (get_user(level, (__u32 __user *)arg)) + return -EFAULT; + + if (!S_ISREG(inode->i_mode) || level >= F2FS_IOPRIO_MAX) + return -EINVAL; + + inode_lock(inode); + F2FS_I(inode)->ioprio_hint = level; + inode_unlock(inode); + return 0; +} + int f2fs_precache_extents(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); @@ -4547,6 +4626,8 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_compress_file(filp); case F2FS_IOC_GET_DEV_ALIAS_FILE: return f2fs_ioc_get_dev_alias_file(filp, arg); + case F2FS_IOC_IO_PRIO: + return f2fs_ioc_io_prio(filp, arg); default: return -ENOTTY; } @@ -5147,12 +5228,16 @@ static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len, } err = generic_fadvise(filp, offset, len, advice); - if (!err && advice == POSIX_FADV_DONTNEED && - test_opt(F2FS_I_SB(inode), COMPRESS_CACHE) && - f2fs_compressed_file(inode)) - f2fs_invalidate_compress_pages(F2FS_I_SB(inode), inode->i_ino); + if (err) + return err; - return err; + if (advice == POSIX_FADV_DONTNEED && + (test_opt(F2FS_I_SB(inode), COMPRESS_CACHE) && + f2fs_compressed_file(inode))) + f2fs_invalidate_compress_pages(F2FS_I_SB(inode), inode->i_ino); + else if (advice == POSIX_FADV_NOREUSE) + f2fs_keep_noreuse_range(inode, offset, len); + return 0; } #ifdef CONFIG_COMPAT @@ -5261,6 +5346,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_DECOMPRESS_FILE: case 
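
f2fs_keep_noreuse_range() above is the entry point of the new DONATE_INODE machinery: POSIX_FADV_NOREUSE registers an inclusive page range on a per-superblock list (a zero length unregisters it), and the shrinker later walks that list to drop the cached pages. A condensed user-space model of the bookkeeping; locking, the real list_head plumbing and the atomic-file bailout are omitted:

    #include <stdio.h>

    struct inode_info {
        int on_list;                    /* !list_empty(&fi->gdonate_list) */
        long donate_start, donate_end;  /* inclusive page indexes */
    };

    static unsigned int donate_files;

    static void keep_noreuse_range(struct inode_info *fi, long start, long npages)
    {
        if (npages == 0) {              /* len == 0 removes the range */
            if (fi->on_list) {
                fi->on_list = 0;
                donate_files--;
            }
            return;
        }
        if (!fi->on_list) {
            fi->on_list = 1;            /* kernel: list_add_tail() */
            donate_files++;
        }                               /* else kernel: list_move_tail() */
        fi->donate_start = start;
        fi->donate_end = start + npages - 1;
    }

    int main(void)
    {
        struct inode_info fi = { 0 };

        keep_noreuse_range(&fi, 0, 16);             /* register pages 0..15 */
        keep_noreuse_range(&fi, 0, 0);              /* unregister */
        printf("donate_files=%u\n", donate_files);  /* 0 */
        return 0;
    }
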
F2FS_IOC_COMPRESS_FILE: case F2FS_IOC_GET_DEV_ALIAS_FILE: + case F2FS_IOC_IO_PRIO: break; default: return -ENOIOCTLCMD; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index faf9fa1c804d..2b8f9239bede 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1449,14 +1449,14 @@ out: } static int move_data_page(struct inode *inode, block_t bidx, int gc_type, - unsigned int segno, int off) + unsigned int segno, int off) { - struct page *page; + struct folio *folio; int err = 0; - page = f2fs_get_lock_data_page(inode, bidx, true); - if (IS_ERR(page)) - return PTR_ERR(page); + folio = f2fs_get_lock_data_folio(inode, bidx, true); + if (IS_ERR(folio)) + return PTR_ERR(folio); if (!check_valid_map(F2FS_I_SB(inode), segno, off)) { err = -ENOENT; @@ -1468,12 +1468,12 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, goto out; if (gc_type == BG_GC) { - if (folio_test_writeback(page_folio(page))) { + if (folio_test_writeback(folio)) { err = -EAGAIN; goto out; } - set_page_dirty(page); - set_page_private_gcing(page); + folio_mark_dirty(folio); + set_page_private_gcing(&folio->page); } else { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -1483,37 +1483,37 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC, .old_blkaddr = NULL_ADDR, - .page = page, + .page = &folio->page, .encrypted_page = NULL, .need_lock = LOCK_REQ, .io_type = FS_GC_DATA_IO, }; - bool is_dirty = PageDirty(page); + bool is_dirty = folio_test_dirty(folio); retry: - f2fs_wait_on_page_writeback(page, DATA, true, true); + f2fs_folio_wait_writeback(folio, DATA, true, true); - set_page_dirty(page); - if (clear_page_dirty_for_io(page)) { + folio_mark_dirty(folio); + if (folio_clear_dirty_for_io(folio)) { inode_dec_dirty_pages(inode); f2fs_remove_dirty_inode(inode); } - set_page_private_gcing(page); + set_page_private_gcing(&folio->page); err = f2fs_do_write_data_page(&fio); if (err) { - clear_page_private_gcing(page); + clear_page_private_gcing(&folio->page); if (err == -ENOMEM) { memalloc_retry_wait(GFP_NOFS); goto retry; } if (is_dirty) - set_page_dirty(page); + folio_mark_dirty(folio); } } out: - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); return err; } @@ -1542,7 +1542,6 @@ next_step: entry = sum; for (off = 0; off < usable_blks_in_seg; off++, entry++) { - struct page *data_page; struct inode *inode; struct node_info dni; /* dnode info for the data */ unsigned int ofs_in_node, nofs; @@ -1585,6 +1584,7 @@ next_step: ofs_in_node = le16_to_cpu(entry->ofs_in_node); if (phase == 3) { + struct folio *data_folio; int err; inode = f2fs_iget(sb, dni.ino); @@ -1635,15 +1635,15 @@ next_step: continue; } - data_page = f2fs_get_read_data_page(inode, start_bidx, + data_folio = f2fs_get_read_data_folio(inode, start_bidx, REQ_RAHEAD, true, NULL); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - if (IS_ERR(data_page)) { + if (IS_ERR(data_folio)) { iput(inode); continue; } - f2fs_put_page(data_page, 0); + f2fs_folio_put(data_folio, false); add_gc_inode(gc_list, inode); continue; } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 3e3c35d4c98b..ad92e9008781 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -119,7 +119,7 @@ int f2fs_read_inline_data(struct inode *inode, struct folio *folio) { struct page *ipage; - ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) { folio_unlock(folio); return PTR_ERR(ipage); @@ -237,7 +237,7 @@ int 
f2fs_convert_inline_inode(struct inode *inode) f2fs_lock_op(sbi); - ipage = f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto out; @@ -265,7 +265,7 @@ int f2fs_write_inline_data(struct inode *inode, struct folio *folio) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *ipage; - ipage = f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -312,7 +312,7 @@ int f2fs_recover_inline_data(struct inode *inode, struct page *npage) if (f2fs_has_inline_data(inode) && ri && (ri->i_inline & F2FS_INLINE_DATA)) { process_inline: - ipage = f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -331,7 +331,7 @@ process_inline: } if (f2fs_has_inline_data(inode)) { - ipage = f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); f2fs_truncate_inline_inode(inode, ipage, 0); @@ -361,7 +361,7 @@ struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, struct page *ipage; void *inline_dentry; - ipage = f2fs_get_node_page(sbi, dir->i_ino); + ipage = f2fs_get_inode_page(sbi, dir->i_ino); if (IS_ERR(ipage)) { *res_page = ipage; return NULL; @@ -609,7 +609,7 @@ int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry) if (err) goto out; - ipage = f2fs_get_node_page(sbi, dir->i_ino); + ipage = f2fs_get_inode_page(sbi, dir->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto out_fname; @@ -644,7 +644,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname, struct page *page = NULL; int err = 0; - ipage = f2fs_get_node_page(sbi, dir->i_ino); + ipage = f2fs_get_inode_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -734,7 +734,7 @@ bool f2fs_empty_inline_dir(struct inode *dir) void *inline_dentry; struct f2fs_dentry_ptr d; - ipage = f2fs_get_node_page(sbi, dir->i_ino); + ipage = f2fs_get_inode_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return false; @@ -765,7 +765,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (ctx->pos == d.max) return 0; - ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -797,7 +797,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, struct page *ipage; int err = 0; - ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 3dd25f64d6f1..83f862578fc8 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -34,10 +34,8 @@ void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync) if (f2fs_inode_dirtied(inode, sync)) return; - if (f2fs_is_atomic_file(inode)) { - set_inode_flag(inode, FI_ATOMIC_DIRTIED); + if (f2fs_is_atomic_file(inode)) return; - } mark_inode_dirty_sync(inode); } @@ -410,7 +408,7 @@ static int do_read_inode(struct inode *inode) if (f2fs_check_nid_range(sbi, inode->i_ino)) return -EINVAL; - node_page = f2fs_get_node_page(sbi, inode->i_ino); + node_page = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(node_page)) return PTR_ERR(node_page); @@ -757,7 +755,7 @@ void f2fs_update_inode_page(struct inode *inode) struct page *node_page; int count = 0; retry: - node_page = 
f2fs_get_node_page(sbi, inode->i_ino); + node_page = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(node_page)) { int err = PTR_ERR(node_page); @@ -765,8 +763,12 @@ retry: if (err == -ENOENT) return; + if (err == -EFSCORRUPTED) + goto stop_checkpoint; + if (err == -ENOMEM || ++count <= DEFAULT_RETRY_IO_COUNT) goto retry; +stop_checkpoint: f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_UPDATE_INODE); return; } @@ -789,6 +791,13 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) !is_inode_flag_set(inode, FI_DIRTY_INODE)) return 0; + /* + * no need to update inode page, ultimately f2fs_evict_inode() will + * clear dirty status of inode. + */ + if (f2fs_cp_error(sbi)) + return -EIO; + if (!f2fs_is_checkpoint_ready(sbi)) { f2fs_mark_inode_dirty_sync(inode, true); return -ENOSPC; @@ -804,6 +813,19 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) return 0; } +static void f2fs_remove_donate_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (list_empty(&F2FS_I(inode)->gdonate_list)) + return; + + spin_lock(&sbi->inode_lock[DONATE_INODE]); + list_del_init(&F2FS_I(inode)->gdonate_list); + sbi->donate_files--; + spin_unlock(&sbi->inode_lock[DONATE_INODE]); +} + /* * Called at the last iput() if i_nlink is zero */ @@ -838,6 +860,7 @@ void f2fs_evict_inode(struct inode *inode) f2fs_bug_on(sbi, get_dirty_pages(inode)); f2fs_remove_dirty_inode(inode); + f2fs_remove_donate_inode(inode); if (!IS_DEVICE_ALIASING(inode)) f2fs_destroy_extent_tree(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 24dca4dc85a9..8f8b9b843bdf 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -502,6 +502,14 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, goto out; } + if (inode->i_nlink == 0) { + f2fs_warn(F2FS_I_SB(inode), "%s: inode (ino=%lx) has zero i_nlink", + __func__, inode->i_ino); + err = -EFSCORRUPTED; + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + goto out_iput; + } + if (IS_ENCRYPTED(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index f88392fc4ba9..5f15c224bf78 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -310,10 +310,10 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, start, nr); } -bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page) +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, const struct folio *folio) { - return NODE_MAPPING(sbi) == page->mapping && - IS_DNODE(page) && is_cold_node(page); + return NODE_MAPPING(sbi) == folio->mapping && + IS_DNODE(&folio->page) && is_cold_node(&folio->page); } void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi) @@ -778,7 +778,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) npage[0] = dn->inode_page; if (!npage[0]) { - npage[0] = f2fs_get_node_page(sbi, nids[0]); + npage[0] = f2fs_get_inode_page(sbi, nids[0]); if (IS_ERR(npage[0])) return PTR_ERR(npage[0]); } @@ -1130,26 +1130,33 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) unsigned int nofs = 0; struct f2fs_inode *ri; struct dnode_of_data dn; - struct page *page; + struct folio *folio; trace_f2fs_truncate_inode_blocks_enter(inode, from); level = get_node_path(inode, from, offset, noffset); - if (level < 0) { + if (level <= 0) { + if (!level) { + level = -EFSCORRUPTED; + f2fs_err(sbi, "%s: inode ino=%lx has corrupted node block, from:%lu addrs:%u", + __func__, 
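
The retry policy in f2fs_update_inode_page() above is easy to misread: -ENOENT returns quietly, the new -EFSCORRUPTED case jumps straight to stopping the checkpoint, -ENOMEM retries without being counted, and any other error retries up to DEFAULT_RETRY_IO_COUNT times before also stopping the checkpoint. A reduced sketch of that shape; the stubs are hypothetical and the retry budget is an assumed value:

    #include <errno.h>
    #include <stdio.h>

    #define EFSCORRUPTED EUCLEAN     /* same aliasing the kernel uses */
    #define DEFAULT_RETRY_IO_COUNT 8 /* assumed budget for this sketch */

    static int attempts;

    static int get_inode_page(void)  /* hypothetical stub: fails twice */
    {
        return attempts++ < 2 ? -EAGAIN : 0;
    }

    static void stop_checkpoint(void) /* hypothetical stub */
    {
        puts("checkpoint stopped");
    }

    static void update_inode_page(void)
    {
        int count = 0;

        for (;;) {
            int err = get_inode_page();

            if (err == 0 || err == -ENOENT)
                return;              /* done, or nothing to update */
            if (err == -EFSCORRUPTED)
                break;               /* corruption: stop immediately */
            if (err == -ENOMEM || ++count <= DEFAULT_RETRY_IO_COUNT)
                continue;            /* -ENOMEM retries are not counted */
            break;                   /* retry budget exhausted */
        }
        stop_checkpoint();
    }

    int main(void)
    {
        update_inode_page();         /* succeeds on the third attempt */
        return 0;
    }
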
inode->i_ino, + from, ADDRS_PER_INODE(inode)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } trace_f2fs_truncate_inode_blocks_exit(inode, level); return level; } - page = f2fs_get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) { - trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page)); - return PTR_ERR(page); + folio = f2fs_get_inode_folio(sbi, inode->i_ino); + if (IS_ERR(folio)) { + trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(folio)); + return PTR_ERR(folio); } - set_new_dnode(&dn, inode, page, NULL, 0); - unlock_page(page); + set_new_dnode(&dn, inode, &folio->page, NULL, 0); + folio_unlock(folio); - ri = F2FS_INODE(page); + ri = F2FS_INODE(&folio->page); switch (level) { case 0: case 1: @@ -1178,7 +1185,7 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) skip_partial: while (cont) { - dn.nid = get_nid(page, offset[0], true); + dn.nid = get_nid(&folio->page, offset[0], true); switch (offset[0]) { case NODE_DIR1_BLOCK: case NODE_DIR2_BLOCK: @@ -1199,7 +1206,7 @@ skip_partial: BUG(); } if (err == -ENOENT) { - set_sbi_flag(F2FS_P_SB(page), SBI_NEED_FSCK); + set_sbi_flag(F2FS_F_SB(folio), SBI_NEED_FSCK); f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); f2fs_err_ratelimited(sbi, "truncate node fail, ino:%lu, nid:%u, " @@ -1210,18 +1217,18 @@ skip_partial: } if (err < 0) goto fail; - if (offset[1] == 0 && get_nid(page, offset[0], true)) { - lock_page(page); - BUG_ON(page->mapping != NODE_MAPPING(sbi)); - set_nid(page, offset[0], 0, true); - unlock_page(page); + if (offset[1] == 0 && get_nid(&folio->page, offset[0], true)) { + folio_lock(folio); + BUG_ON(folio->mapping != NODE_MAPPING(sbi)); + set_nid(&folio->page, offset[0], 0, true); + folio_unlock(folio); } offset[1] = 0; offset[0]++; nofs += err; } fail: - f2fs_put_page(page, 0); + f2fs_folio_put(folio, false); trace_f2fs_truncate_inode_blocks_exit(inode, err); return err > 0 ? 0 : err; } @@ -1238,7 +1245,7 @@ int f2fs_truncate_xattr_node(struct inode *inode) if (!nid) return 0; - npage = f2fs_get_node_page(sbi, nid); + npage = f2fs_get_xnode_page(sbi, nid); if (IS_ERR(npage)) return PTR_ERR(npage); @@ -1449,10 +1456,32 @@ void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) f2fs_put_page(apage, err ? 
1 : 0); } -static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, - struct page *parent, int start) +static int sanity_check_node_footer(struct f2fs_sb_info *sbi, + struct page *page, pgoff_t nid, + enum node_type ntype) { - struct page *page; + if (unlikely(nid != nid_of_node(page) || + (ntype == NODE_TYPE_INODE && !IS_INODE(page)) || + (ntype == NODE_TYPE_XATTR && + !f2fs_has_xattr_block(ofs_of_node(page))) || + time_to_inject(sbi, FAULT_INCONSISTENT_FOOTER))) { + f2fs_warn(sbi, "inconsistent node block, node_type:%d, nid:%lu, " + "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + ntype, nid, nid_of_node(page), ino_of_node(page), + ofs_of_node(page), cpver_of_node(page), + next_blkaddr_of_node(page)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); + return -EFSCORRUPTED; + } + return 0; +} + +static struct folio *__get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid, + struct page *parent, int start, + enum node_type ntype) +{ + struct folio *folio; int err; if (!nid) @@ -1460,11 +1489,11 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, if (f2fs_check_nid_range(sbi, nid)) return ERR_PTR(-EINVAL); repeat: - page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); - if (!page) - return ERR_PTR(-ENOMEM); + folio = f2fs_grab_cache_folio(NODE_MAPPING(sbi), nid, false); + if (IS_ERR(folio)) + return folio; - err = read_node_page(page, 0); + err = read_node_page(&folio->page, 0); if (err < 0) { goto out_put_err; } else if (err == LOCKED_PAGE) { @@ -1475,54 +1504,72 @@ repeat: if (parent) f2fs_ra_node_pages(parent, start + 1, MAX_RA_NODE); - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { - f2fs_put_page(page, 1); + if (unlikely(folio->mapping != NODE_MAPPING(sbi))) { + f2fs_folio_put(folio, true); goto repeat; } - if (unlikely(!PageUptodate(page))) { + if (unlikely(!folio_test_uptodate(folio))) { err = -EIO; goto out_err; } - if (!f2fs_inode_chksum_verify(sbi, page)) { + if (!f2fs_inode_chksum_verify(sbi, &folio->page)) { err = -EFSBADCRC; goto out_err; } page_hit: - if (likely(nid == nid_of_node(page))) - return page; - - f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", - nid, nid_of_node(page), ino_of_node(page), - ofs_of_node(page), cpver_of_node(page), - next_blkaddr_of_node(page)); - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); - err = -EFSCORRUPTED; + err = sanity_check_node_footer(sbi, &folio->page, nid, ntype); + if (!err) + return folio; out_err: - ClearPageUptodate(page); + folio_clear_uptodate(folio); out_put_err: /* ENOENT comes from read_node_page which is not an error. 
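
sanity_check_node_footer() above folds what used to be an open-coded nid comparison at page_hit into one helper that also verifies the caller's expectation of the node type (an inode block for NODE_TYPE_INODE, an xattr block for NODE_TYPE_XATTR) and gives FAULT_INCONSISTENT_FOOTER a single injection hook. A reduced model of the check, with a placeholder footer record standing in for the on-disk one:

    #include <assert.h>
    #include <errno.h>
    #include <stdbool.h>

    #define EFSCORRUPTED EUCLEAN    /* as in the kernel */

    enum node_type { NODE_TYPE_REGULAR, NODE_TYPE_INODE, NODE_TYPE_XATTR };

    struct footer { unsigned long nid; bool is_inode, is_xattr; };

    static int sanity_check_node_footer(const struct footer *f,
                                        unsigned long nid, enum node_type t)
    {
        if (nid != f->nid ||
            (t == NODE_TYPE_INODE && !f->is_inode) ||
            (t == NODE_TYPE_XATTR && !f->is_xattr))
            return -EFSCORRUPTED;   /* kernel also flags SBI_NEED_FSCK */
        return 0;
    }

    int main(void)
    {
        struct footer f = { .nid = 7, .is_inode = true, .is_xattr = false };

        assert(sanity_check_node_footer(&f, 7, NODE_TYPE_INODE) == 0);
        assert(sanity_check_node_footer(&f, 8, NODE_TYPE_REGULAR) < 0);
        assert(sanity_check_node_footer(&f, 7, NODE_TYPE_XATTR) < 0);
        return 0;
    }
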
*/ if (err != -ENOENT) - f2fs_handle_page_eio(sbi, page_folio(page), NODE); - f2fs_put_page(page, 1); + f2fs_handle_page_eio(sbi, folio, NODE); + f2fs_folio_put(folio, true); return ERR_PTR(err); } struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) { - return __get_node_page(sbi, nid, NULL, 0); + struct folio *folio = __get_node_folio(sbi, nid, NULL, 0, + NODE_TYPE_REGULAR); + + return &folio->page; +} + +struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino) +{ + return __get_node_folio(sbi, ino, NULL, 0, NODE_TYPE_INODE); +} + +struct page *f2fs_get_inode_page(struct f2fs_sb_info *sbi, pgoff_t ino) +{ + struct folio *folio = f2fs_get_inode_folio(sbi, ino); + + return &folio->page; +} + +struct page *f2fs_get_xnode_page(struct f2fs_sb_info *sbi, pgoff_t xnid) +{ + struct folio *folio = __get_node_folio(sbi, xnid, NULL, 0, + NODE_TYPE_XATTR); + + return &folio->page; } struct page *f2fs_get_node_page_ra(struct page *parent, int start) { struct f2fs_sb_info *sbi = F2FS_P_SB(parent); nid_t nid = get_nid(parent, start, false); + struct folio *folio = __get_node_folio(sbi, nid, parent, start, + NODE_TYPE_REGULAR); - return __get_node_page(sbi, nid, parent, start); + return &folio->page; } static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) @@ -1561,11 +1608,11 @@ iput_out: iput(inode); } -static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) +static struct folio *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) { pgoff_t index; struct folio_batch fbatch; - struct page *last_page = NULL; + struct folio *last_folio = NULL; int nr_folios; folio_batch_init(&fbatch); @@ -1577,45 +1624,45 @@ static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) int i; for (i = 0; i < nr_folios; i++) { - struct page *page = &fbatch.folios[i]->page; + struct folio *folio = fbatch.folios[i]; if (unlikely(f2fs_cp_error(sbi))) { - f2fs_put_page(last_page, 0); + f2fs_folio_put(last_folio, false); folio_batch_release(&fbatch); return ERR_PTR(-EIO); } - if (!IS_DNODE(page) || !is_cold_node(page)) + if (!IS_DNODE(&folio->page) || !is_cold_node(&folio->page)) continue; - if (ino_of_node(page) != ino) + if (ino_of_node(&folio->page) != ino) continue; - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { + if (unlikely(folio->mapping != NODE_MAPPING(sbi))) { continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (ino_of_node(page) != ino) + if (ino_of_node(&folio->page) != ino) goto continue_unlock; - if (!PageDirty(page)) { + if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } - if (last_page) - f2fs_put_page(last_page, 0); + if (last_folio) + f2fs_folio_put(last_folio, false); - get_page(page); - last_page = page; - unlock_page(page); + folio_get(folio); + last_folio = folio; + folio_unlock(folio); } folio_batch_release(&fbatch); cond_resched(); } - return last_page; + return last_folio; } static int __write_node_page(struct page *page, bool atomic, bool *submitted, @@ -1694,7 +1741,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, fio.op_flags |= REQ_PREFLUSH | REQ_FUA; /* should add to global list before clearing PAGECACHE status */ - if (f2fs_in_warm_node_list(sbi, page)) { + if (f2fs_in_warm_node_list(sbi, folio)) { seq = f2fs_add_fsync_node_entry(sbi, page); if (seq_id) *seq_id = seq; @@ -1769,13 +1816,6 @@ release_page: return err; } -static int f2fs_write_node_page(struct page *page, - struct 
writeback_control *wbc) -{ - return __write_node_page(page, false, NULL, wbc, false, - FS_NODE_IO, NULL); -} - int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic, unsigned int *seq_id) @@ -1783,16 +1823,16 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, pgoff_t index; struct folio_batch fbatch; int ret = 0; - struct page *last_page = NULL; + struct folio *last_folio = NULL; bool marked = false; nid_t ino = inode->i_ino; int nr_folios; int nwritten = 0; if (atomic) { - last_page = last_fsync_dnode(sbi, ino); - if (IS_ERR_OR_NULL(last_page)) - return PTR_ERR_OR_ZERO(last_page); + last_folio = last_fsync_dnode(sbi, ino); + if (IS_ERR_OR_NULL(last_folio)) + return PTR_ERR_OR_ZERO(last_folio); } retry: folio_batch_init(&fbatch); @@ -1804,73 +1844,73 @@ retry: int i; for (i = 0; i < nr_folios; i++) { - struct page *page = &fbatch.folios[i]->page; + struct folio *folio = fbatch.folios[i]; bool submitted = false; if (unlikely(f2fs_cp_error(sbi))) { - f2fs_put_page(last_page, 0); + f2fs_folio_put(last_folio, false); folio_batch_release(&fbatch); ret = -EIO; goto out; } - if (!IS_DNODE(page) || !is_cold_node(page)) + if (!IS_DNODE(&folio->page) || !is_cold_node(&folio->page)) continue; - if (ino_of_node(page) != ino) + if (ino_of_node(&folio->page) != ino) continue; - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { + if (unlikely(folio->mapping != NODE_MAPPING(sbi))) { continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (ino_of_node(page) != ino) + if (ino_of_node(&folio->page) != ino) goto continue_unlock; - if (!PageDirty(page) && page != last_page) { + if (!folio_test_dirty(folio) && folio != last_folio) { /* someone wrote it for us */ goto continue_unlock; } - f2fs_wait_on_page_writeback(page, NODE, true, true); + f2fs_folio_wait_writeback(folio, NODE, true, true); - set_fsync_mark(page, 0); - set_dentry_mark(page, 0); + set_fsync_mark(&folio->page, 0); + set_dentry_mark(&folio->page, 0); - if (!atomic || page == last_page) { - set_fsync_mark(page, 1); + if (!atomic || folio == last_folio) { + set_fsync_mark(&folio->page, 1); percpu_counter_inc(&sbi->rf_node_block_count); - if (IS_INODE(page)) { + if (IS_INODE(&folio->page)) { if (is_inode_flag_set(inode, FI_DIRTY_INODE)) - f2fs_update_inode(inode, page); - set_dentry_mark(page, + f2fs_update_inode(inode, &folio->page); + set_dentry_mark(&folio->page, f2fs_need_dentry_mark(sbi, ino)); } /* may be written by other thread */ - if (!PageDirty(page)) - set_page_dirty(page); + if (!folio_test_dirty(folio)) + folio_mark_dirty(folio); } - if (!clear_page_dirty_for_io(page)) + if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; - ret = __write_node_page(page, atomic && - page == last_page, + ret = __write_node_page(&folio->page, atomic && + folio == last_folio, &submitted, wbc, true, FS_NODE_IO, seq_id); if (ret) { - unlock_page(page); - f2fs_put_page(last_page, 0); + folio_unlock(folio); + f2fs_folio_put(last_folio, false); break; } else if (submitted) { nwritten++; } - if (page == last_page) { - f2fs_put_page(page, 0); + if (folio == last_folio) { + f2fs_folio_put(folio, false); marked = true; break; } @@ -1883,11 +1923,11 @@ continue_unlock: } if (!ret && atomic && !marked) { f2fs_debug(sbi, "Retry to write fsync mark: ino=%u, idx=%lx", - ino, page_folio(last_page)->index); - lock_page(last_page); - f2fs_wait_on_page_writeback(last_page, NODE, true, true); - 
set_page_dirty(last_page); - unlock_page(last_page); + ino, last_folio->index); + folio_lock(last_folio); + f2fs_folio_wait_writeback(last_folio, NODE, true, true); + folio_mark_dirty(last_folio); + folio_unlock(last_folio); goto retry; } out: @@ -1920,18 +1960,18 @@ static int f2fs_match_ino(struct inode *inode, unsigned long ino, void *data) return 1; } -static bool flush_dirty_inode(struct page *page) +static bool flush_dirty_inode(struct folio *folio) { - struct f2fs_sb_info *sbi = F2FS_P_SB(page); + struct f2fs_sb_info *sbi = F2FS_F_SB(folio); struct inode *inode; - nid_t ino = ino_of_node(page); + nid_t ino = ino_of_node(&folio->page); inode = find_inode_nowait(sbi->sb, ino, f2fs_match_ino, NULL); if (!inode) return false; - f2fs_update_inode(inode, page); - unlock_page(page); + f2fs_update_inode(inode, &folio->page); + folio_unlock(folio); iput(inode); return true; @@ -1951,32 +1991,27 @@ void f2fs_flush_inline_data(struct f2fs_sb_info *sbi) int i; for (i = 0; i < nr_folios; i++) { - struct page *page = &fbatch.folios[i]->page; + struct folio *folio = fbatch.folios[i]; - if (!IS_INODE(page)) + if (!IS_INODE(&folio->page)) continue; - lock_page(page); - - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { -continue_unlock: - unlock_page(page); - continue; - } + folio_lock(folio); - if (!PageDirty(page)) { - /* someone wrote it for us */ - goto continue_unlock; - } + if (unlikely(folio->mapping != NODE_MAPPING(sbi))) + goto unlock; + if (!folio_test_dirty(folio)) + goto unlock; /* flush inline_data, if it's async context. */ - if (page_private_inline(page)) { - clear_page_private_inline(page); - unlock_page(page); - flush_inline_data(sbi, ino_of_node(page)); + if (page_private_inline(&folio->page)) { + clear_page_private_inline(&folio->page); + folio_unlock(folio); + flush_inline_data(sbi, ino_of_node(&folio->page)); continue; } - unlock_page(page); +unlock: + folio_unlock(folio); } folio_batch_release(&fbatch); cond_resched(); @@ -2005,7 +2040,7 @@ next_step: int i; for (i = 0; i < nr_folios; i++) { - struct page *page = &fbatch.folios[i]->page; + struct folio *folio = fbatch.folios[i]; bool submitted = false; /* give a priority to WB_SYNC threads */ @@ -2021,27 +2056,27 @@ next_step: * 1. dentry dnodes * 2. 
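
The flushing-sequence comment above enumerates three passes (indirect nodes, dentry dnodes, file dnodes), and the continue-conditions that follow encode which node pages each pass takes: pass 0 only non-dnodes, pass 1 only hot dnodes, pass 2 only cold dnodes. The same filter restated as a standalone predicate, with a small check:

    #include <assert.h>
    #include <stdbool.h>

    /* true when the sweep at `step` should skip this node page */
    static bool should_skip(int step, bool is_dnode, bool is_cold)
    {
        if (step == 0)
            return is_dnode;
        if (step == 1)
            return !is_dnode || is_cold;
        return !is_dnode || !is_cold;           /* step 2 */
    }

    int main(void)
    {
        assert(!should_skip(0, false, false));  /* indirect node: pass 0 */
        assert(!should_skip(1, true, false));   /* dentry dnode:  pass 1 */
        assert(!should_skip(2, true, true));    /* file dnode:    pass 2 */
        assert(should_skip(0, true, true));
        return 0;
    }
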
file dnodes */ - if (step == 0 && IS_DNODE(page)) + if (step == 0 && IS_DNODE(&folio->page)) continue; - if (step == 1 && (!IS_DNODE(page) || - is_cold_node(page))) + if (step == 1 && (!IS_DNODE(&folio->page) || + is_cold_node(&folio->page))) continue; - if (step == 2 && (!IS_DNODE(page) || - !is_cold_node(page))) + if (step == 2 && (!IS_DNODE(&folio->page) || + !is_cold_node(&folio->page))) continue; lock_node: if (wbc->sync_mode == WB_SYNC_ALL) - lock_page(page); - else if (!trylock_page(page)) + folio_lock(folio); + else if (!folio_trylock(folio)) continue; - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { + if (unlikely(folio->mapping != NODE_MAPPING(sbi))) { continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (!PageDirty(page)) { + if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } @@ -2051,29 +2086,29 @@ continue_unlock: goto write_node; /* flush inline_data */ - if (page_private_inline(page)) { - clear_page_private_inline(page); - unlock_page(page); - flush_inline_data(sbi, ino_of_node(page)); + if (page_private_inline(&folio->page)) { + clear_page_private_inline(&folio->page); + folio_unlock(folio); + flush_inline_data(sbi, ino_of_node(&folio->page)); goto lock_node; } /* flush dirty inode */ - if (IS_INODE(page) && flush_dirty_inode(page)) + if (IS_INODE(&folio->page) && flush_dirty_inode(folio)) goto lock_node; write_node: - f2fs_wait_on_page_writeback(page, NODE, true, true); + f2fs_folio_wait_writeback(folio, NODE, true, true); - if (!clear_page_dirty_for_io(page)) + if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; - set_fsync_mark(page, 0); - set_dentry_mark(page, 0); + set_fsync_mark(&folio->page, 0); + set_dentry_mark(&folio->page, 0); - ret = __write_node_page(page, false, &submitted, + ret = __write_node_page(&folio->page, false, &submitted, wbc, do_balance, io_type, NULL); if (ret) - unlock_page(page); + folio_unlock(folio); else if (submitted) nwritten++; @@ -2207,7 +2242,6 @@ static bool f2fs_dirty_node_folio(struct address_space *mapping, * Structure of the f2fs node operations */ const struct address_space_operations f2fs_node_aops = { - .writepage = f2fs_write_node_page, .writepages = f2fs_write_node_pages, .dirty_folio = f2fs_dirty_node_folio, .invalidate_folio = f2fs_invalidate_folio, @@ -2269,24 +2303,6 @@ static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i, } } -bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi) -{ - struct f2fs_nm_info *nm_i = NM_I(sbi); - unsigned int i; - bool ret = true; - - f2fs_down_read(&nm_i->nat_tree_lock); - for (i = 0; i < nm_i->nat_blocks; i++) { - if (!test_bit_le(i, nm_i->nat_block_bitmap)) { - ret = false; - break; - } - } - f2fs_up_read(&nm_i->nat_tree_lock); - - return ret; -} - static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, bool set, bool build) { @@ -2717,7 +2733,7 @@ int f2fs_recover_inline_xattr(struct inode *inode, struct page *page) struct page *ipage; struct f2fs_inode *ri; - ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -2965,23 +2981,7 @@ add_out: list_add_tail(&nes->set_list, head); } -static void __update_nat_bits(struct f2fs_nm_info *nm_i, unsigned int nat_ofs, - unsigned int valid) -{ - if (valid == 0) { - __set_bit_le(nat_ofs, nm_i->empty_nat_bits); - __clear_bit_le(nat_ofs, nm_i->full_nat_bits); - return; - } - - __clear_bit_le(nat_ofs, nm_i->empty_nat_bits); - if (valid 
== NAT_ENTRY_PER_BLOCK) - __set_bit_le(nat_ofs, nm_i->full_nat_bits); - else - __clear_bit_le(nat_ofs, nm_i->full_nat_bits); -} - -static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, +static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, struct page *page) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -2990,7 +2990,7 @@ static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, int valid = 0; int i = 0; - if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) + if (!enabled_nat_bits(sbi, NULL)) return; if (nat_index == 0) { @@ -3001,36 +3001,17 @@ static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, if (le32_to_cpu(nat_blk->entries[i].block_addr) != NULL_ADDR) valid++; } - - __update_nat_bits(nm_i, nat_index, valid); -} - -void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi) -{ - struct f2fs_nm_info *nm_i = NM_I(sbi); - unsigned int nat_ofs; - - f2fs_down_read(&nm_i->nat_tree_lock); - - for (nat_ofs = 0; nat_ofs < nm_i->nat_blocks; nat_ofs++) { - unsigned int valid = 0, nid_ofs = 0; - - /* handle nid zero due to it should never be used */ - if (unlikely(nat_ofs == 0)) { - valid = 1; - nid_ofs = 1; - } - - for (; nid_ofs < NAT_ENTRY_PER_BLOCK; nid_ofs++) { - if (!test_bit_le(nid_ofs, - nm_i->free_nid_bitmap[nat_ofs])) - valid++; - } - - __update_nat_bits(nm_i, nat_ofs, valid); + if (valid == 0) { + __set_bit_le(nat_index, nm_i->empty_nat_bits); + __clear_bit_le(nat_index, nm_i->full_nat_bits); + return; } - f2fs_up_read(&nm_i->nat_tree_lock); + __clear_bit_le(nat_index, nm_i->empty_nat_bits); + if (valid == NAT_ENTRY_PER_BLOCK) + __set_bit_le(nat_index, nm_i->full_nat_bits); + else + __clear_bit_le(nat_index, nm_i->full_nat_bits); } static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, @@ -3049,7 +3030,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, * #1, flush nat entries to journal in current hot data summary block. * #2, flush nat entries to nat page. */ - if ((cpc->reason & CP_UMOUNT) || + if (enabled_nat_bits(sbi, cpc) || !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) to_journal = false; @@ -3096,7 +3077,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, if (to_journal) { up_write(&curseg->journal_rwsem); } else { - update_nat_bits(sbi, start_nid, page); + __update_nat_bits(sbi, start_nid, page); f2fs_put_page(page, 1); } @@ -3127,7 +3108,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * during unmount, let's flush nat_bits before checking * nat_cnt[DIRTY_NAT]. */ - if (cpc->reason & CP_UMOUNT) { + if (enabled_nat_bits(sbi, cpc)) { f2fs_down_write(&nm_i->nat_tree_lock); remove_nats_in_journal(sbi); f2fs_up_write(&nm_i->nat_tree_lock); @@ -3143,7 +3124,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * entries, remove all entries from journal and merge them * into nat entry set. 
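
The bitmap update folded into __update_nat_bits() above classifies each NAT block into one of three states: empty (no valid entry), full (all NAT_ENTRY_PER_BLOCK entries valid), or neither, keeping one bit per block in empty_nat_bits and full_nat_bits. The classification in isolation; the bitmaps become byte arrays and the entry count is illustrative:

    #include <assert.h>

    #define NAT_ENTRY_PER_BLOCK 455 /* illustrative; derived from block size */
    #define NAT_BLOCKS 1024

    static unsigned char full_bits[NAT_BLOCKS], empty_bits[NAT_BLOCKS];

    static void classify_nat_block(unsigned int blk, unsigned int valid)
    {
        empty_bits[blk] = (valid == 0);
        full_bits[blk] = (valid == NAT_ENTRY_PER_BLOCK);
    }

    int main(void)
    {
        classify_nat_block(0, 0);
        classify_nat_block(1, NAT_ENTRY_PER_BLOCK);
        classify_nat_block(2, 100);
        assert(empty_bits[0] && !full_bits[0]);
        assert(!empty_bits[1] && full_bits[1]);
        assert(!empty_bits[2] && !full_bits[2]);
        return 0;
    }
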
*/ - if (cpc->reason & CP_UMOUNT || + if (enabled_nat_bits(sbi, cpc) || !__has_cursum_space(journal, nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) remove_nats_in_journal(sbi); @@ -3180,18 +3161,15 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) __u64 cp_ver = cur_cp_version(ckpt); block_t nat_bits_addr; + if (!enabled_nat_bits(sbi, NULL)) + return 0; + nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8); nm_i->nat_bits = f2fs_kvzalloc(sbi, F2FS_BLK_TO_BYTES(nm_i->nat_bits_blocks), GFP_KERNEL); if (!nm_i->nat_bits) return -ENOMEM; - nm_i->full_nat_bits = nm_i->nat_bits + 8; - nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; - - if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) - return 0; - nat_bits_addr = __start_cp_addr(sbi) + BLKS_PER_SEG(sbi) - nm_i->nat_bits_blocks; for (i = 0; i < nm_i->nat_bits_blocks; i++) { @@ -3208,12 +3186,13 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) cp_ver |= (cur_cp_crc(ckpt) << 32); if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) { - clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG); - f2fs_notice(sbi, "Disable nat_bits due to incorrect cp_ver (%llu, %llu)", - cp_ver, le64_to_cpu(*(__le64 *)nm_i->nat_bits)); + disable_nat_bits(sbi, true); return 0; } + nm_i->full_nat_bits = nm_i->nat_bits + 8; + nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; + f2fs_notice(sbi, "Found nat_bits in checkpoint"); return 0; } @@ -3224,7 +3203,7 @@ static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) unsigned int i = 0; nid_t nid, last_nid; - if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) + if (!enabled_nat_bits(sbi, NULL)) return; for (i = 0; i < nm_i->nat_blocks; i++) { @@ -3296,6 +3275,9 @@ static int init_node_manager(struct f2fs_sb_info *sbi) if (!nm_i->nat_bitmap) return -ENOMEM; + if (!test_opt(sbi, NAT_BITS)) + disable_nat_bits(sbi, true); + err = __get_nat_bitmaps(sbi); if (err) return err; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 6aea13024ac1..103a437e6425 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -52,6 +52,13 @@ enum { IS_PREALLOC, /* nat entry is preallocated */ }; +/* For node type in __get_node_folio() */ +enum node_type { + NODE_TYPE_REGULAR, + NODE_TYPE_INODE, + NODE_TYPE_XATTR, +}; + /* * For node information */ @@ -248,7 +255,7 @@ static inline nid_t nid_of_node(struct page *node_page) return le32_to_cpu(rn->footer.nid); } -static inline unsigned int ofs_of_node(struct page *node_page) +static inline unsigned int ofs_of_node(const struct page *node_page) { struct f2fs_node *rn = F2FS_NODE(node_page); unsigned flag = le32_to_cpu(rn->footer.flag); @@ -342,7 +349,7 @@ static inline bool is_recoverable_dnode(struct page *page) * `- indirect node ((6 + 2N) + (N - 1)(N + 1)) * `- direct node */ -static inline bool IS_DNODE(struct page *node_page) +static inline bool IS_DNODE(const struct page *node_page) { unsigned int ofs = ofs_of_node(node_page); @@ -389,7 +396,7 @@ static inline nid_t get_nid(struct page *p, int off, bool i) * - Mark cold data pages in page cache */ -static inline int is_node(struct page *page, int type) +static inline int is_node(const struct page *page, int type) { struct f2fs_node *rn = F2FS_NODE(page); return le32_to_cpu(rn->footer.flag) & BIT(type); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c282e8a0a2ec..396ef71f41e3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2096,7 +2096,9 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, return false; if (!force) { - if (!f2fs_realtime_discard_enable(sbi) || 
!se->valid_blocks || + if (!f2fs_realtime_discard_enable(sbi) || + (!se->valid_blocks && + !IS_CURSEG(sbi, cpc->trim_start)) || SM_I(sbi)->dcc_info->nr_discards >= SM_I(sbi)->dcc_info->max_discards) return false; @@ -2320,10 +2322,9 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY; dcc->discard_io_aware = DPOLICY_IO_AWARE_ENABLE; - if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) + if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT || + F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) dcc->discard_granularity = BLKS_PER_SEG(sbi); - else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) - dcc->discard_granularity = BLKS_PER_SEC(sbi); INIT_LIST_HEAD(&dcc->entry_list); for (i = 0; i < MAX_PLIST_NUM; i++) @@ -2806,7 +2807,7 @@ find_other_zone: MAIN_SECS(sbi)); if (secno >= MAIN_SECS(sbi)) { ret = -ENOSPC; - f2fs_bug_on(sbi, 1); + f2fs_bug_on(sbi, !pinning); goto out_unlock; } } @@ -2848,7 +2849,7 @@ got_it: out_unlock: spin_unlock(&free_i->segmap_lock); - if (ret == -ENOSPC) + if (ret == -ENOSPC && !pinning) f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_NO_SEGMENT); return ret; } @@ -2921,6 +2922,13 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) return curseg->segno; } +static void reset_curseg_fields(struct curseg_info *curseg) +{ + curseg->inited = false; + curseg->segno = NULL_SEGNO; + curseg->next_segno = 0; +} + /* * Allocate a current working segment. * This function always allocates a free segment in LFS manner. @@ -2939,7 +2947,7 @@ static int new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) ret = get_new_segment(sbi, &segno, new_sec, pinning); if (ret) { if (ret == -ENOSPC) - curseg->segno = NULL_SEGNO; + reset_curseg_fields(curseg); return ret; } @@ -3710,13 +3718,6 @@ static void f2fs_randomize_chunk(struct f2fs_sb_info *sbi, get_random_u32_inclusive(1, sbi->max_fragment_hole); } -static void reset_curseg_fields(struct curseg_info *curseg) -{ - curseg->inited = false; - curseg->segno = NULL_SEGNO; - curseg->next_segno = 0; -} - int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, @@ -3902,6 +3903,7 @@ static int log_type_to_seg_type(enum log_type type) static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { + struct folio *folio = page_folio(fio->page); enum log_type type = __get_segment_type(fio); int seg_type = log_type_to_seg_type(type); bool keep_order = (f2fs_lfs_mode(fio->sbi) && @@ -3912,10 +3914,10 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) if (f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type, fio)) { - if (fscrypt_inode_uses_fs_layer_crypto(fio->page->mapping->host)) + if (fscrypt_inode_uses_fs_layer_crypto(folio->mapping->host)) fscrypt_finalize_bounce_page(&fio->encrypted_page); - end_page_writeback(fio->page); - if (f2fs_in_warm_node_list(fio->sbi, fio->page)) + folio_end_writeback(folio); + if (f2fs_in_warm_node_list(fio->sbi, folio)) f2fs_del_fsync_node_entry(fio->sbi, fio->page); goto out; } @@ -4154,22 +4156,21 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, f2fs_update_data_blkaddr(dn, new_addr); } -void f2fs_wait_on_page_writeback(struct page *page, - enum page_type type, bool ordered, bool locked) +void 
f2fs_folio_wait_writeback(struct folio *folio, enum page_type type, + bool ordered, bool locked) { - if (folio_test_writeback(page_folio(page))) { - struct f2fs_sb_info *sbi = F2FS_P_SB(page); + if (folio_test_writeback(folio)) { + struct f2fs_sb_info *sbi = F2FS_F_SB(folio); /* submit cached LFS IO */ - f2fs_submit_merged_write_cond(sbi, NULL, page, 0, type); + f2fs_submit_merged_write_cond(sbi, NULL, &folio->page, 0, type); /* submit cached IPU IO */ - f2fs_submit_merged_ipu_write(sbi, NULL, page); + f2fs_submit_merged_ipu_write(sbi, NULL, &folio->page); if (ordered) { - wait_on_page_writeback(page); - f2fs_bug_on(sbi, locked && - folio_test_writeback(page_folio(page))); + folio_wait_writeback(folio); + f2fs_bug_on(sbi, locked && folio_test_writeback(folio)); } else { - wait_for_stable_page(page); + folio_wait_stable(folio); } } } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 943be4f1d6d2..0465dc00b349 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -559,13 +559,16 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, unsigned int node_blocks, unsigned int data_blocks, unsigned int dent_blocks) { - unsigned int segno, left_blocks, blocks; int i; /* check current data/node sections in the worst case. */ for (i = CURSEG_HOT_DATA; i < NR_PERSISTENT_LOG; i++) { segno = CURSEG_I(sbi, i)->segno; + + if (unlikely(segno == NULL_SEGNO)) + return false; + left_blocks = CAP_BLKS_PER_SEC(sbi) - get_ckpt_valid_blocks(sbi, segno, true); @@ -576,6 +579,10 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, /* check current data section for dentry blocks. */ segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno; + + if (unlikely(segno == NULL_SEGNO)) + return false; + left_blocks = CAP_BLKS_PER_SEC(sbi) - get_ckpt_valid_blocks(sbi, segno, true); if (dent_blocks > left_blocks) diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 83d6fb97dcae..9c8d3aee89af 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -73,7 +73,7 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink, mutex_unlock(&sbi->umount_mutex); } spin_unlock(&f2fs_list_lock); - return count; + return count ?: SHRINK_EMPTY; } unsigned long f2fs_shrink_scan(struct shrinker *shrink, @@ -130,6 +130,96 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink, return freed; } +unsigned int f2fs_donate_files(void) +{ + struct f2fs_sb_info *sbi; + struct list_head *p; + unsigned int donate_files = 0; + + spin_lock(&f2fs_list_lock); + p = f2fs_list.next; + while (p != &f2fs_list) { + sbi = list_entry(p, struct f2fs_sb_info, s_list); + + /* stop f2fs_put_super */ + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + spin_unlock(&f2fs_list_lock); + + donate_files += sbi->donate_files; + + spin_lock(&f2fs_list_lock); + p = p->next; + mutex_unlock(&sbi->umount_mutex); + } + spin_unlock(&f2fs_list_lock); + + return donate_files; +} + +static unsigned int do_reclaim_caches(struct f2fs_sb_info *sbi, + unsigned int reclaim_caches_kb) +{ + struct inode *inode; + struct f2fs_inode_info *fi; + unsigned int nfiles = sbi->donate_files; + pgoff_t npages = reclaim_caches_kb >> (PAGE_SHIFT - 10); + + while (npages && nfiles--) { + pgoff_t len; + + spin_lock(&sbi->inode_lock[DONATE_INODE]); + if (list_empty(&sbi->inode_list[DONATE_INODE])) { + spin_unlock(&sbi->inode_lock[DONATE_INODE]); + break; + } + fi = list_first_entry(&sbi->inode_list[DONATE_INODE], + struct f2fs_inode_info, gdonate_list); + list_move_tail(&fi->gdonate_list, &sbi->inode_list[DONATE_INODE]); + inode = 
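
do_reclaim_caches() above walks the DONATE_INODE list, rotates each visited inode to the tail, pins it with igrab() and invalidates its registered page range; the budget is converted from KiB to pages with reclaim_caches_kb >> (PAGE_SHIFT - 10), and the remainder is shifted back on return. The unit arithmetic, checked in isolation under a 4 KiB page assumption:

    #include <assert.h>

    #define PAGE_SHIFT 12 /* assuming 4 KiB pages */

    int main(void)
    {
        unsigned int kb = 1024;                          /* 1 MiB budget */
        unsigned long npages = kb >> (PAGE_SHIFT - 10);  /* = 256 pages */

        assert(npages == 256);
        assert((npages << (PAGE_SHIFT - 10)) == kb);     /* back to KiB */
        return 0;
    }
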
igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[DONATE_INODE]); + + if (!inode) + continue; + + len = fi->donate_end - fi->donate_start + 1; + npages = npages < len ? 0 : npages - len; + invalidate_inode_pages2_range(inode->i_mapping, + fi->donate_start, fi->donate_end); + iput(inode); + cond_resched(); + } + return npages << (PAGE_SHIFT - 10); +} + +void f2fs_reclaim_caches(unsigned int reclaim_caches_kb) +{ + struct f2fs_sb_info *sbi; + struct list_head *p; + + spin_lock(&f2fs_list_lock); + p = f2fs_list.next; + while (p != &f2fs_list && reclaim_caches_kb) { + sbi = list_entry(p, struct f2fs_sb_info, s_list); + + /* stop f2fs_put_super */ + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + spin_unlock(&f2fs_list_lock); + + reclaim_caches_kb = do_reclaim_caches(sbi, reclaim_caches_kb); + + spin_lock(&f2fs_list_lock); + p = p->next; + mutex_unlock(&sbi->umount_mutex); + } + spin_unlock(&f2fs_list_lock); +} + void f2fs_join_shrinker(struct f2fs_sb_info *sbi) { spin_lock(&f2fs_list_lock); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 19b67828ae32..f087b2b71c89 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -63,6 +63,7 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_BLKADDR_VALIDITY] = "invalid blkaddr", [FAULT_BLKADDR_CONSISTENCE] = "inconsistent blkaddr", [FAULT_NO_SEGMENT] = "no free segment", + [FAULT_INCONSISTENT_FOOTER] = "inconsistent footer", }; int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate, @@ -190,6 +191,7 @@ enum { Opt_memory_mode, Opt_age_extent_cache, Opt_errors, + Opt_nat_bits, Opt_err, }; @@ -269,6 +271,7 @@ static match_table_t f2fs_tokens = { {Opt_memory_mode, "memory=%s"}, {Opt_age_extent_cache, "age_extent_cache"}, {Opt_errors, "errors=%s"}, + {Opt_nat_bits, "nat_bits"}, {Opt_err, NULL}, }; @@ -383,10 +386,10 @@ static void init_once(void *foo) #ifdef CONFIG_QUOTA static const char * const quotatypes[] = INITQFNAMES; #define QTYPE2NAME(t) (quotatypes[t]) -static int f2fs_set_qf_name(struct super_block *sb, int qtype, +static int f2fs_set_qf_name(struct f2fs_sb_info *sbi, int qtype, substring_t *args) { - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct super_block *sb = sbi->sb; char *qname; int ret = -EINVAL; @@ -424,9 +427,9 @@ errout: return ret; } -static int f2fs_clear_qf_name(struct super_block *sb, int qtype) +static int f2fs_clear_qf_name(struct f2fs_sb_info *sbi, int qtype) { - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct super_block *sb = sbi->sb; if (sb_any_quota_loaded(sb) && F2FS_OPTION(sbi).s_qf_names[qtype]) { f2fs_err(sbi, "Cannot change journaled quota options when quota turned on"); @@ -483,12 +486,11 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) } #endif -static int f2fs_set_test_dummy_encryption(struct super_block *sb, +static int f2fs_set_test_dummy_encryption(struct f2fs_sb_info *sbi, const char *opt, const substring_t *arg, bool is_remount) { - struct f2fs_sb_info *sbi = F2FS_SB(sb); struct fs_parameter param = { .type = fs_value_is_string, .string = arg->from ? 
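
A small idiom above is worth a note: f2fs_shrink_count() now ends with "return count ?: SHRINK_EMPTY;", where GNU C's ?: yields the left operand when it is nonzero, so a zero count is reported as SHRINK_EMPTY and the shrinker core can tell "nothing cached" apart from a real count. A self-contained check; the SHRINK_EMPTY value is assumed to match include/linux/shrinker.h:

    #include <assert.h>

    #define SHRINK_EMPTY (~0UL - 1) /* assumed to match the kernel header */

    static unsigned long shrink_count(unsigned long count)
    {
        return count ?: SHRINK_EMPTY;   /* GNU C: count if nonzero */
    }

    int main(void)
    {
        assert(shrink_count(42) == 42);
        assert(shrink_count(0) == SHRINK_EMPTY);
        return 0;
    }
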
arg->from : "", @@ -671,9 +673,8 @@ static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) #endif #endif -static int parse_options(struct super_block *sb, char *options, bool is_remount) +static int parse_options(struct f2fs_sb_info *sbi, char *options, bool is_remount) { - struct f2fs_sb_info *sbi = F2FS_SB(sb); substring_t args[MAX_OPT_ARGS]; #ifdef CONFIG_F2FS_FS_COMPRESSION unsigned char (*ext)[F2FS_EXTENSION_LEN]; @@ -687,7 +688,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) int ret; if (!options) - goto default_check; + return 0; while ((p = strsep(&options, ",")) != NULL) { int token; @@ -728,10 +729,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) set_opt(sbi, DISABLE_ROLL_FORWARD); break; case Opt_norecovery: - /* this option mounts f2fs with ro */ + /* requires ro mount, checked in f2fs_default_check */ set_opt(sbi, NORECOVERY); - if (!f2fs_readonly(sb)) - return -EINVAL; break; case Opt_discard: if (!f2fs_hw_support_discard(sbi)) { @@ -772,16 +771,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) break; #else case Opt_user_xattr: - f2fs_info(sbi, "user_xattr options not supported"); - break; case Opt_nouser_xattr: - f2fs_info(sbi, "nouser_xattr options not supported"); - break; case Opt_inline_xattr: - f2fs_info(sbi, "inline_xattr options not supported"); - break; case Opt_noinline_xattr: - f2fs_info(sbi, "noinline_xattr options not supported"); + case Opt_inline_xattr_size: + f2fs_info(sbi, "xattr options not supported"); break; #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL @@ -793,10 +787,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) break; #else case Opt_acl: - f2fs_info(sbi, "acl options not supported"); - break; case Opt_noacl: - f2fs_info(sbi, "noacl options not supported"); + f2fs_info(sbi, "acl options not supported"); break; #endif case Opt_active_logs: @@ -838,7 +830,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) set_opt(sbi, READ_EXTENT_CACHE); break; case Opt_noextent_cache: - if (F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_DEVICE_ALIAS)) { + if (f2fs_sb_has_device_alias(sbi)) { f2fs_err(sbi, "device aliasing requires extent cache"); return -EINVAL; } @@ -919,18 +911,15 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) break; #else case Opt_fault_injection: - f2fs_info(sbi, "fault_injection options not supported"); - break; - case Opt_fault_type: - f2fs_info(sbi, "fault_type options not supported"); + f2fs_info(sbi, "fault injection options not supported"); break; #endif case Opt_lazytime: - sb->s_flags |= SB_LAZYTIME; + set_opt(sbi, LAZYTIME); break; case Opt_nolazytime: - sb->s_flags &= ~SB_LAZYTIME; + clear_opt(sbi, LAZYTIME); break; #ifdef CONFIG_QUOTA case Opt_quota: @@ -944,32 +933,32 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) set_opt(sbi, PRJQUOTA); break; case Opt_usrjquota: - ret = f2fs_set_qf_name(sb, USRQUOTA, &args[0]); + ret = f2fs_set_qf_name(sbi, USRQUOTA, &args[0]); if (ret) return ret; break; case Opt_grpjquota: - ret = f2fs_set_qf_name(sb, GRPQUOTA, &args[0]); + ret = f2fs_set_qf_name(sbi, GRPQUOTA, &args[0]); if (ret) return ret; break; case Opt_prjjquota: - ret = f2fs_set_qf_name(sb, PRJQUOTA, &args[0]); + ret = f2fs_set_qf_name(sbi, PRJQUOTA, &args[0]); if (ret) return ret; break; case Opt_offusrjquota: - ret = f2fs_clear_qf_name(sb, USRQUOTA); + ret = f2fs_clear_qf_name(sbi, 
USRQUOTA); if (ret) return ret; break; case Opt_offgrpjquota: - ret = f2fs_clear_qf_name(sb, GRPQUOTA); + ret = f2fs_clear_qf_name(sbi, GRPQUOTA); if (ret) return ret; break; case Opt_offprjjquota: - ret = f2fs_clear_qf_name(sb, PRJQUOTA); + ret = f2fs_clear_qf_name(sbi, PRJQUOTA); if (ret) return ret; break; @@ -1039,14 +1028,14 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) kfree(name); break; case Opt_test_dummy_encryption: - ret = f2fs_set_test_dummy_encryption(sb, p, &args[0], + ret = f2fs_set_test_dummy_encryption(sbi, p, &args[0], is_remount); if (ret) return ret; break; case Opt_inlinecrypt: #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT - sb->s_flags |= SB_INLINECRYPT; + set_opt(sbi, INLINECRYPT); #else f2fs_info(sbi, "inline encryption not supported"); #endif @@ -1322,13 +1311,20 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } kfree(name); break; + case Opt_nat_bits: + set_opt(sbi, NAT_BITS); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); return -EINVAL; } } -default_check: + return 0; +} + +static int f2fs_default_check(struct f2fs_sb_info *sbi) +{ #ifdef CONFIG_QUOTA if (f2fs_check_quota_options(sbi)) return -EINVAL; @@ -1418,6 +1414,12 @@ default_check: f2fs_err(sbi, "Allow to mount readonly mode only"); return -EROFS; } + + if (test_opt(sbi, NORECOVERY) && !f2fs_readonly(sbi->sb)) { + f2fs_err(sbi, "norecovery requires readonly mount"); + return -EINVAL; + } + return 0; } @@ -1441,6 +1443,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) spin_lock_init(&fi->i_size_lock); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); + INIT_LIST_HEAD(&fi->gdonate_list); init_f2fs_rwsem(&fi->i_gc_rwsem[READ]); init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]); init_f2fs_rwsem(&fi->i_xattr_sem); @@ -1527,6 +1530,10 @@ int f2fs_inode_dirtied(struct inode *inode, bool sync) inc_page_count(sbi, F2FS_DIRTY_IMETA); } spin_unlock(&sbi->inode_lock[DIRTY_META]); + + if (!ret && f2fs_is_atomic_file(inode)) + set_inode_flag(inode, FI_ATOMIC_DIRTIED); + return ret; } @@ -1737,22 +1744,28 @@ int f2fs_sync_fs(struct super_block *sb, int sync) static int f2fs_freeze(struct super_block *sb) { + struct f2fs_sb_info *sbi = F2FS_SB(sb); + if (f2fs_readonly(sb)) return 0; /* IO error happened before */ - if (unlikely(f2fs_cp_error(F2FS_SB(sb)))) + if (unlikely(f2fs_cp_error(sbi))) return -EIO; /* must be clean, since sync_filesystem() was already called */ - if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) + if (is_sbi_flag_set(sbi, SBI_IS_DIRTY)) return -EINVAL; + sbi->umount_lock_holder = current; + /* Let's flush checkpoints and stop the thread. 
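The restructuring above moves cross-option constraints out of parse_options() and into a single f2fs_default_check() pass that runs after parsing, for both mount and remount. A minimal sketch of that parse-then-validate shape in plain C, with hypothetical option names standing in for the real mount options:

#include <stdio.h>
#include <string.h>

/* Hypothetical flags standing in for f2fs mount options. */
struct opts { int norecovery; int readonly; };

/* Phase 1: record what the user asked for; no cross-checks here. */
static int parse(struct opts *o, const char *token)
{
	if (!strcmp(token, "norecovery"))
		o->norecovery = 1;
	else if (!strcmp(token, "ro"))
		o->readonly = 1;
	else
		return -1;	/* unknown option */
	return 0;
}

/* Phase 2: validate combinations once everything is parsed,
 * mirroring the norecovery-requires-readonly check above. */
static int default_check(const struct opts *o)
{
	if (o->norecovery && !o->readonly) {
		fprintf(stderr, "norecovery requires readonly mount\n");
		return -1;
	}
	return 0;
}

int main(void)
{
	struct opts o = {0};

	parse(&o, "norecovery");	/* no "ro": must fail phase 2 */
	return default_check(&o) ? 1 : 0;
}

Deferring the check means option order no longer matters: "norecovery,ro" and "ro,norecovery" are judged identically once parsing is complete.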
*/ - f2fs_flush_ckpt_thread(F2FS_SB(sb)); + f2fs_flush_ckpt_thread(sbi); + + sbi->umount_lock_holder = NULL; /* to avoid deadlock on f2fs_evict_inode->SB_FREEZE_FS */ - set_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING); + set_sbi_flag(sbi, SBI_IS_FREEZING); return 0; } @@ -1836,7 +1849,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = total_count - start_count; spin_lock(&sbi->stat_lock); - + if (sbi->carve_out) + buf->f_blocks -= sbi->current_reserved_blocks; user_block_count = sbi->user_block_count; total_valid_node_count = valid_node_count(sbi); avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; @@ -2128,6 +2142,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_PANIC) seq_printf(seq, ",errors=%s", "panic"); + if (test_opt(sbi, NAT_BITS)) + seq_puts(seq, ",nat_bits"); + return 0; } @@ -2175,8 +2192,8 @@ static void default_options(struct f2fs_sb_info *sbi, bool remount) set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); set_opt(sbi, MERGE_CHECKPOINT); + set_opt(sbi, LAZYTIME); F2FS_OPTION(sbi).unusable_cap = 0; - sbi->sb->s_flags |= SB_LAZYTIME; if (!f2fs_is_readonly(sbi)) set_opt(sbi, FLUSH_MERGE); if (f2fs_sb_has_blkzoned(sbi)) @@ -2318,6 +2335,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool no_discard = !test_opt(sbi, DISCARD); bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE); bool block_unit_discard = f2fs_block_unit_discard(sbi); + bool no_nat_bits = !test_opt(sbi, NAT_BITS); #ifdef CONFIG_QUOTA int i, j; #endif @@ -2329,6 +2347,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) org_mount_opt = sbi->mount_opt; old_sb_flags = sb->s_flags; + sbi->umount_lock_holder = current; + #ifdef CONFIG_QUOTA org_mount_opt.s_jquota_fmt = F2FS_OPTION(sbi).s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { @@ -2359,7 +2379,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) default_options(sbi, true); /* parse mount options */ - err = parse_options(sb, data, true); + err = parse_options(sbi, data, true); if (err) goto restore_opts; @@ -2374,6 +2394,10 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } #endif + err = f2fs_default_check(sbi); + if (err) + goto restore_opts; + /* flush outstanding errors before changing fs state */ flush_work(&sbi->s_error_work); @@ -2444,6 +2468,12 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } + if (no_nat_bits == !!test_opt(sbi, NAT_BITS)) { + err = -EINVAL; + f2fs_warn(sbi, "switch nat_bits option is not allowed"); + goto restore_opts; + } + if ((*flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) { err = -EINVAL; f2fs_warn(sbi, "disabling checkpoint not compatible with read-only"); @@ -2552,6 +2582,8 @@ skip: limit_reserve_root(sbi); *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); + + sbi->umount_lock_holder = NULL; return 0; restore_checkpoint: if (need_enable_checkpoint) { @@ -2592,6 +2624,8 @@ restore_opts: #endif sbi->mount_opt = org_mount_opt; sb->s_flags = old_sb_flags; + + sbi->umount_lock_holder = NULL; return err; } @@ -2908,7 +2942,7 @@ out: return ret; } -int f2fs_quota_sync(struct super_block *sb, int type) +int f2fs_do_quota_sync(struct super_block *sb, int type) { struct f2fs_sb_info *sbi = F2FS_SB(sb); struct quota_info *dqopt = sb_dqopt(sb); @@ -2956,11 +2990,21 @@ int f2fs_quota_sync(struct super_block *sb, int type) return ret; } +static 
int f2fs_quota_sync(struct super_block *sb, int type) +{ + int ret; + + F2FS_SB(sb)->umount_lock_holder = current; + ret = f2fs_do_quota_sync(sb, type); + F2FS_SB(sb)->umount_lock_holder = NULL; + return ret; +} + static int f2fs_quota_on(struct super_block *sb, int type, int format_id, const struct path *path) { struct inode *inode; - int err; + int err = 0; /* if quota sysfile exists, deny enabling quota with specific file */ if (f2fs_sb_has_quota_ino(F2FS_SB(sb))) { @@ -2971,31 +3015,34 @@ static int f2fs_quota_on(struct super_block *sb, int type, int format_id, if (path->dentry->d_sb != sb) return -EXDEV; - err = f2fs_quota_sync(sb, type); + F2FS_SB(sb)->umount_lock_holder = current; + + err = f2fs_do_quota_sync(sb, type); if (err) - return err; + goto out; inode = d_inode(path->dentry); err = filemap_fdatawrite(inode->i_mapping); if (err) - return err; + goto out; err = filemap_fdatawait(inode->i_mapping); if (err) - return err; + goto out; err = dquot_quota_on(sb, type, format_id, path); if (err) - return err; + goto out; inode_lock(inode); F2FS_I(inode)->i_flags |= F2FS_QUOTA_DEFAULT_FL; f2fs_set_inode_flags(inode); inode_unlock(inode); f2fs_mark_inode_dirty_sync(inode, false); - - return 0; +out: + F2FS_SB(sb)->umount_lock_holder = NULL; + return err; } static int __f2fs_quota_off(struct super_block *sb, int type) @@ -3006,7 +3053,7 @@ static int __f2fs_quota_off(struct super_block *sb, int type) if (!inode || !igrab(inode)) return dquot_quota_off(sb, type); - err = f2fs_quota_sync(sb, type); + err = f2fs_do_quota_sync(sb, type); if (err) goto out_put; @@ -3029,6 +3076,8 @@ static int f2fs_quota_off(struct super_block *sb, int type) struct f2fs_sb_info *sbi = F2FS_SB(sb); int err; + F2FS_SB(sb)->umount_lock_holder = current; + err = __f2fs_quota_off(sb, type); /* @@ -3038,6 +3087,9 @@ static int f2fs_quota_off(struct super_block *sb, int type) */ if (is_journalled_quota(sbi)) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + + F2FS_SB(sb)->umount_lock_holder = NULL; + return err; } @@ -3170,7 +3222,7 @@ int f2fs_dquot_initialize(struct inode *inode) return 0; } -int f2fs_quota_sync(struct super_block *sb, int type) +int f2fs_do_quota_sync(struct super_block *sb, int type) { return 0; } @@ -4220,6 +4272,8 @@ void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason) if (shutdown) set_sbi_flag(sbi, SBI_IS_SHUTDOWN); + else + dump_stack(); /* * Continue filesystem operators if errors=continue. Should not set @@ -4495,7 +4549,11 @@ try_onemore: goto free_sb_buf; } - err = parse_options(sb, options, false); + err = parse_options(sbi, options, false); + if (err) + goto free_options; + + err = f2fs_default_check(sbi); if (err) goto free_options; @@ -4533,6 +4591,14 @@ try_onemore: sb->s_time_gran = 1; sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? 
SB_POSIXACL : 0); + if (test_opt(sbi, INLINECRYPT)) + sb->s_flags |= SB_INLINECRYPT; + + if (test_opt(sbi, LAZYTIME)) + sb->s_flags |= SB_LAZYTIME; + else + sb->s_flags &= ~SB_LAZYTIME; + super_set_uuid(sb, (void *) raw_super->uuid, sizeof(raw_super->uuid)); super_set_sysfs_name_bdev(sb); sb->s_iflags |= SB_I_CGROUPWB; @@ -4703,6 +4769,7 @@ try_onemore: if (err) goto free_compress_inode; + sbi->umount_lock_holder = current; #ifdef CONFIG_QUOTA /* Enable quota usage during mount */ if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb)) { @@ -4718,8 +4785,10 @@ try_onemore: if (err) goto free_meta; - if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG))) + if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG))) { + skip_recovery = true; goto reset_checkpoint; + } /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD) && @@ -4769,10 +4838,10 @@ try_onemore: } } +reset_checkpoint: #ifdef CONFIG_QUOTA f2fs_recover_quota_end(sbi, quota_enabled); #endif -reset_checkpoint: /* * If the f2fs is not readonly and fsync data recovery succeeds, * write pointer consistency of cursegs and other zones are already @@ -4829,6 +4898,8 @@ reset_checkpoint: f2fs_update_time(sbi, CP_TIME); f2fs_update_time(sbi, REQ_TIME); clear_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); + + sbi->umount_lock_holder = NULL; return 0; sync_free_meta: @@ -4931,6 +5002,8 @@ static void kill_f2fs_super(struct super_block *sb) struct f2fs_sb_info *sbi = F2FS_SB(sb); if (sb->s_root) { + sbi->umount_lock_holder = current; + set_sbi_flag(sbi, SBI_IS_CLOSE); f2fs_stop_gc_thread(sbi); f2fs_stop_discard_thread(sbi); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index d15c68b28952..c69161366467 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -61,6 +61,12 @@ struct f2fs_attr { int id; }; +struct f2fs_base_attr { + struct attribute attr; + ssize_t (*show)(struct f2fs_base_attr *a, char *buf); + ssize_t (*store)(struct f2fs_base_attr *a, const char *buf, size_t len); +}; + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf); @@ -862,6 +868,25 @@ static void f2fs_sb_release(struct kobject *kobj) complete(&sbi->s_kobj_unregister); } +static ssize_t f2fs_base_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_base_attr *a = container_of(attr, + struct f2fs_base_attr, attr); + + return a->show ? a->show(a, buf) : 0; +} + +static ssize_t f2fs_base_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_base_attr *a = container_of(attr, + struct f2fs_base_attr, attr); + + return a->store ? a->store(a, buf, len) : 0; +} + /* * Note that there are three feature list entries: * 1) /sys/fs/f2fs/features @@ -880,18 +905,50 @@ static void f2fs_sb_release(struct kobject *kobj) * please add new on-disk feature in this list only. * - ref. 
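The new f2fs_base_attr machinery above is the standard sysfs pattern: kobject code only ever sees the embedded struct attribute, and the show/store wrappers recover the containing object with container_of(). A userspace sketch of that dispatch, assuming a simplified struct attribute; the local container_of() mirrors the kernel macro:

#include <stdio.h>
#include <stddef.h>
#include <sys/types.h>

/* Userspace stand-in for struct attribute / struct f2fs_base_attr. */
struct attribute { const char *name; };

struct base_attr {
	struct attribute attr;
	ssize_t (*show)(struct base_attr *a, char *buf);
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static ssize_t feature_show(struct base_attr *a, char *buf)
{
	return (ssize_t)sprintf(buf, "supported\n");
}

static struct base_attr feat = {
	.attr = { .name = "encryption" },
	.show = feature_show,
};

/* Mirrors f2fs_base_attr_show(): only the embedded struct attribute
 * is handed around; container_of() recovers the full object. */
static ssize_t dispatch_show(struct attribute *attr, char *buf)
{
	struct base_attr *a = container_of(attr, struct base_attr, attr);

	return a->show ? a->show(a, buf) : 0;
}

int main(void)
{
	char buf[32];

	dispatch_show(&feat.attr, buf);
	printf("%s: %s", feat.attr.name, buf);
	return 0;
}

The payoff in the patch is type separation: feature and tuning entries no longer need a struct f2fs_sb_info at all, so they get their own smaller attribute type and their own sysfs_ops.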
F2FS_SB_FEATURE_RO_ATTR() */ -static ssize_t f2fs_feature_show(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, char *buf) +static ssize_t f2fs_feature_show(struct f2fs_base_attr *a, char *buf) { return sysfs_emit(buf, "supported\n"); } #define F2FS_FEATURE_RO_ATTR(_name) \ -static struct f2fs_attr f2fs_attr_##_name = { \ +static struct f2fs_base_attr f2fs_base_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = 0444 }, \ .show = f2fs_feature_show, \ } +static ssize_t f2fs_tune_show(struct f2fs_base_attr *a, char *buf) +{ + unsigned int res = 0; + + if (!strcmp(a->attr.name, "reclaim_caches_kb")) + res = f2fs_donate_files(); + + return sysfs_emit(buf, "%u\n", res); +} + +static ssize_t f2fs_tune_store(struct f2fs_base_attr *a, + const char *buf, size_t count) +{ + unsigned long t; + int ret; + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + + if (!strcmp(a->attr.name, "reclaim_caches_kb")) + f2fs_reclaim_caches(t); + + return count; +} + +#define F2FS_TUNE_RW_ATTR(_name) \ +static struct f2fs_base_attr f2fs_base_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0644 }, \ + .show = f2fs_tune_show, \ + .store = f2fs_tune_store, \ +} + static ssize_t f2fs_sb_feature_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -1065,6 +1122,7 @@ F2FS_SBI_GENERAL_RW_ATTR(max_read_extent_count); F2FS_SBI_GENERAL_RO_ATTR(unusable_blocks_per_sec); F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy); #endif +F2FS_SBI_GENERAL_RW_ATTR(carve_out); /* STAT_INFO ATTR */ #ifdef CONFIG_F2FS_STAT_FS @@ -1252,41 +1310,43 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(warm_data_age_threshold), ATTR_LIST(last_age_weight), ATTR_LIST(max_read_extent_count), + ATTR_LIST(carve_out), NULL, }; ATTRIBUTE_GROUPS(f2fs); +#define BASE_ATTR_LIST(name) (&f2fs_base_attr_##name.attr) static struct attribute *f2fs_feat_attrs[] = { #ifdef CONFIG_FS_ENCRYPTION - ATTR_LIST(encryption), - ATTR_LIST(test_dummy_encryption_v2), + BASE_ATTR_LIST(encryption), + BASE_ATTR_LIST(test_dummy_encryption_v2), #if IS_ENABLED(CONFIG_UNICODE) - ATTR_LIST(encrypted_casefold), + BASE_ATTR_LIST(encrypted_casefold), #endif #endif /* CONFIG_FS_ENCRYPTION */ #ifdef CONFIG_BLK_DEV_ZONED - ATTR_LIST(block_zoned), + BASE_ATTR_LIST(block_zoned), #endif - ATTR_LIST(atomic_write), - ATTR_LIST(extra_attr), - ATTR_LIST(project_quota), - ATTR_LIST(inode_checksum), - ATTR_LIST(flexible_inline_xattr), - ATTR_LIST(quota_ino), - ATTR_LIST(inode_crtime), - ATTR_LIST(lost_found), + BASE_ATTR_LIST(atomic_write), + BASE_ATTR_LIST(extra_attr), + BASE_ATTR_LIST(project_quota), + BASE_ATTR_LIST(inode_checksum), + BASE_ATTR_LIST(flexible_inline_xattr), + BASE_ATTR_LIST(quota_ino), + BASE_ATTR_LIST(inode_crtime), + BASE_ATTR_LIST(lost_found), #ifdef CONFIG_FS_VERITY - ATTR_LIST(verity), + BASE_ATTR_LIST(verity), #endif - ATTR_LIST(sb_checksum), + BASE_ATTR_LIST(sb_checksum), #if IS_ENABLED(CONFIG_UNICODE) - ATTR_LIST(casefold), + BASE_ATTR_LIST(casefold), #endif - ATTR_LIST(readonly), + BASE_ATTR_LIST(readonly), #ifdef CONFIG_F2FS_FS_COMPRESSION - ATTR_LIST(compression), + BASE_ATTR_LIST(compression), #endif - ATTR_LIST(pin_file), + BASE_ATTR_LIST(pin_file), NULL, }; ATTRIBUTE_GROUPS(f2fs_feat); @@ -1343,6 +1403,14 @@ static struct attribute *f2fs_sb_feat_attrs[] = { }; ATTRIBUTE_GROUPS(f2fs_sb_feat); +F2FS_TUNE_RW_ATTR(reclaim_caches_kb); + +static struct attribute *f2fs_tune_attrs[] = { + BASE_ATTR_LIST(reclaim_caches_kb), + NULL, +}; +ATTRIBUTE_GROUPS(f2fs_tune); + static const struct sysfs_ops f2fs_attr_ops 
= { .show = f2fs_attr_show, .store = f2fs_attr_store, @@ -1362,15 +1430,34 @@ static struct kset f2fs_kset = { .kobj = {.ktype = &f2fs_ktype}, }; +static const struct sysfs_ops f2fs_feat_attr_ops = { + .show = f2fs_base_attr_show, + .store = f2fs_base_attr_store, +}; + static const struct kobj_type f2fs_feat_ktype = { .default_groups = f2fs_feat_groups, - .sysfs_ops = &f2fs_attr_ops, + .sysfs_ops = &f2fs_feat_attr_ops, }; static struct kobject f2fs_feat = { .kset = &f2fs_kset, }; +static const struct sysfs_ops f2fs_tune_attr_ops = { + .show = f2fs_base_attr_show, + .store = f2fs_base_attr_store, +}; + +static const struct kobj_type f2fs_tune_ktype = { + .default_groups = f2fs_tune_groups, + .sysfs_ops = &f2fs_tune_attr_ops, +}; + +static struct kobject f2fs_tune = { + .kset = &f2fs_kset, +}; + static ssize_t f2fs_stat_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { @@ -1607,6 +1694,11 @@ int __init f2fs_init_sysfs(void) if (ret) goto put_kobject; + ret = kobject_init_and_add(&f2fs_tune, &f2fs_tune_ktype, + NULL, "tuning"); + if (ret) + goto put_kobject; + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); if (!f2fs_proc_root) { ret = -ENOMEM; @@ -1614,7 +1706,9 @@ int __init f2fs_init_sysfs(void) } return 0; + put_kobject: + kobject_put(&f2fs_tune); kobject_put(&f2fs_feat); kset_unregister(&f2fs_kset); return ret; @@ -1622,6 +1716,7 @@ put_kobject: void f2fs_exit_sysfs(void) { + kobject_put(&f2fs_tune); kobject_put(&f2fs_feat); kset_unregister(&f2fs_kset); remove_proc_entry("fs/f2fs", NULL); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 3f3874943679..c691b35618ad 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -282,7 +282,7 @@ static int read_inline_xattr(struct inode *inode, struct page *ipage, if (ipage) { inline_addr = inline_xattr_addr(inode, ipage); } else { - page = f2fs_get_node_page(sbi, inode->i_ino); + page = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(page)) return PTR_ERR(page); @@ -303,7 +303,7 @@ static int read_xattr_block(struct inode *inode, void *txattr_addr) void *xattr_addr; /* The inode already has an extended attribute block. */ - xpage = f2fs_get_node_page(sbi, xnid); + xpage = f2fs_get_xnode_page(sbi, xnid); if (IS_ERR(xpage)) return PTR_ERR(xpage); @@ -449,7 +449,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, if (ipage) { inline_addr = inline_xattr_addr(inode, ipage); } else { - in_page = f2fs_get_node_page(sbi, inode->i_ino); + in_page = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(in_page)) { f2fs_alloc_nid_failed(sbi, new_nid); return PTR_ERR(in_page); @@ -475,7 +475,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, /* write to xattr node block */ if (F2FS_I(inode)->i_xattr_nid) { - xpage = f2fs_get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); + xpage = f2fs_get_xnode_page(sbi, F2FS_I(inode)->i_xattr_nid); if (IS_ERR(xpage)) { err = PTR_ERR(xpage); f2fs_alloc_nid_failed(sbi, new_nid); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3cd99e2dc6ac..cc57367fb641 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -65,7 +65,7 @@ struct wb_writeback_work { * timestamps written to disk after 12 hours, but in the worst case a * few inodes might not their timestamps updated for 24 hours. 
*/ -unsigned int dirtytime_expire_interval = 12 * 60 * 60; +static unsigned int dirtytime_expire_interval = 12 * 60 * 60; static inline struct inode *wb_inode(struct list_head *head) { @@ -2435,14 +2435,7 @@ static void wakeup_dirtytime_writeback(struct work_struct *w) schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); } -static int __init start_dirtytime_writeback(void) -{ - schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); - return 0; -} -__initcall(start_dirtytime_writeback); - -int dirtytime_interval_handler(const struct ctl_table *table, int write, +static int dirtytime_interval_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -2453,6 +2446,25 @@ int dirtytime_interval_handler(const struct ctl_table *table, int write, return ret; } +static const struct ctl_table vm_fs_writeback_table[] = { + { + .procname = "dirtytime_expire_seconds", + .data = &dirtytime_expire_interval, + .maxlen = sizeof(dirtytime_expire_interval), + .mode = 0644, + .proc_handler = dirtytime_interval_handler, + .extra1 = SYSCTL_ZERO, + }, +}; + +static int __init start_dirtytime_writeback(void) +{ + schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); + register_sysctl_init("vm", vm_fs_writeback_table); + return 0; +} +__initcall(start_dirtytime_writeback); + /** * __mark_inode_dirty - internal function to mark an inode dirty * diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index c9bb3be21d2b..fd1147aa3891 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -820,7 +820,7 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to, /* * In this function, we disable page faults when we're holding the * inode glock while doing I/O. If a page fault occurs, we indicate - * that the inode glock may be dropped, fault in the pages manually, + * that the inode glock should be dropped, fault in the pages manually, * and retry. * * Unlike generic_file_read_iter, for reads, iomap_dio_rw can trigger @@ -885,7 +885,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, /* * In this function, we disable page faults when we're holding the * inode glock while doing I/O. If a page fault occurs, we indicate - * that the inode glock may be dropped, fault in the pages manually, + * that the inode glock should be dropped, fault in the pages manually, * and retry. * * For writes, iomap_dio_rw only triggers manual page faults, so we @@ -957,7 +957,7 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) /* * In this function, we disable page faults when we're holding the * inode glock while doing I/O. If a page fault occurs, we indicate - * that the inode glock may be dropped, fault in the pages manually, + * that the inode glock should be dropped, fault in the pages manually, * and retry. */ @@ -1024,7 +1024,7 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, /* * In this function, we disable page faults when we're holding the * inode glock while doing I/O. If a page fault occurs, we indicate - * that the inode glock may be dropped, fault in the pages manually, + * that the inode glock should be dropped, fault in the pages manually, * and retry. 
*/ diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 65c07aa95718..d7220a6fe8f5 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -607,14 +607,19 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) if (gh && (ret & LM_OUT_CANCELED)) gfs2_holder_wake(gh); if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) { - /* move to back of queue and try next entry */ if (ret & LM_OUT_CANCELED) { - list_move_tail(&gh->gh_list, &gl->gl_holders); + list_del_init(&gh->gh_list); + trace_gfs2_glock_queue(gh, 0); + gl->gl_target = gl->gl_state; gh = find_first_waiter(gl); - gl->gl_target = gh->gh_state; - if (do_promote(gl)) - goto out; - goto retry; + if (gh) { + gl->gl_target = gh->gh_state; + if (do_promote(gl)) + goto out; + do_xmote(gl, gh, gl->gl_target); + return; + } + goto out; } /* Some error or failed "try lock" - report it */ if ((ret & LM_OUT_ERROR) || @@ -627,7 +632,6 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) switch(state) { /* Unlocked due to conversion deadlock, try again */ case LM_ST_UNLOCKED: -retry: do_xmote(gl, gh, gl->gl_target); break; /* Conversion fails, unlock and try again */ @@ -661,7 +665,8 @@ retry: do_promote(gl); } out: - clear_bit(GLF_LOCK, &gl->gl_flags); + if (!test_bit(GLF_CANCELING, &gl->gl_flags)) + clear_bit(GLF_LOCK, &gl->gl_flags); } static bool is_system_glock(struct gfs2_glock *gl) @@ -807,6 +812,7 @@ skip_inval: } if (ls->ls_ops->lm_lock) { + set_bit(GLF_PENDING_REPLY, &gl->gl_flags); spin_unlock(&gl->gl_lockref.lock); ret = ls->ls_ops->lm_lock(gl, target, lck_flags); spin_lock(&gl->gl_lockref.lock); @@ -825,6 +831,7 @@ skip_inval: /* The operation will be completed asynchronously. */ return; } + clear_bit(GLF_PENDING_REPLY, &gl->gl_flags); } /* Complete the operation now. */ @@ -843,12 +850,13 @@ static void run_queue(struct gfs2_glock *gl, const int nonblock) __releases(&gl->gl_lockref.lock) __acquires(&gl->gl_lockref.lock) { - struct gfs2_holder *gh = NULL; + struct gfs2_holder *gh; if (test_bit(GLF_LOCK, &gl->gl_flags)) return; set_bit(GLF_LOCK, &gl->gl_flags); + /* While a demote is in progress, the GLF_LOCK flag must be set. 
*/ GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)); if (test_bit(GLF_DEMOTE, &gl->gl_flags) && @@ -860,18 +868,22 @@ __acquires(&gl->gl_lockref.lock) set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE); gl->gl_target = gl->gl_demote_state; + do_xmote(gl, NULL, gl->gl_target); + return; } else { if (test_bit(GLF_DEMOTE, &gl->gl_flags)) gfs2_demote_wake(gl); if (do_promote(gl)) goto out_unlock; gh = find_first_waiter(gl); + if (!gh) + goto out_unlock; gl->gl_target = gh->gh_state; if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) do_error(gl, 0); /* Fail queued try locks */ + do_xmote(gl, gh, gl->gl_target); + return; } - do_xmote(gl, gh, gl->gl_target); - return; out_sched: clear_bit(GLF_LOCK, &gl->gl_flags); @@ -898,12 +910,8 @@ void glock_set_object(struct gfs2_glock *gl, void *object) prev_object = gl->gl_object; gl->gl_object = object; spin_unlock(&gl->gl_lockref.lock); - if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == NULL)) { - pr_warn("glock=%u/%llx\n", - gl->gl_name.ln_type, - (unsigned long long)gl->gl_name.ln_number); + if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == NULL)) gfs2_dump_glock(NULL, gl, true); - } } /** @@ -919,12 +927,8 @@ void glock_clear_object(struct gfs2_glock *gl, void *object) prev_object = gl->gl_object; gl->gl_object = NULL; spin_unlock(&gl->gl_lockref.lock); - if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == object)) { - pr_warn("glock=%u/%llx\n", - gl->gl_name.ln_type, - (unsigned long long)gl->gl_name.ln_number); + if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == object)) gfs2_dump_glock(NULL, gl, true); - } } void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation) @@ -959,6 +963,25 @@ static void gfs2_glock_poke(struct gfs2_glock *gl) gfs2_holder_uninit(&gh); } +static struct gfs2_inode *gfs2_grab_existing_inode(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip; + + spin_lock(&gl->gl_lockref.lock); + ip = gl->gl_object; + if (ip && !igrab(&ip->i_inode)) + ip = NULL; + spin_unlock(&gl->gl_lockref.lock); + if (ip) { + wait_on_inode(&ip->i_inode); + if (is_bad_inode(&ip->i_inode)) { + iput(&ip->i_inode); + ip = NULL; + } + } + return ip; +} + static void gfs2_try_evict(struct gfs2_glock *gl) { struct gfs2_inode *ip; @@ -976,32 +999,15 @@ static void gfs2_try_evict(struct gfs2_glock *gl) * happened below. (Verification is triggered by the call to * gfs2_queue_verify_delete() in gfs2_evict_inode().) */ - spin_lock(&gl->gl_lockref.lock); - ip = gl->gl_object; - if (ip && !igrab(&ip->i_inode)) - ip = NULL; - spin_unlock(&gl->gl_lockref.lock); - if (ip) { - wait_on_inode(&ip->i_inode); - if (is_bad_inode(&ip->i_inode)) { - iput(&ip->i_inode); - ip = NULL; - } - } + ip = gfs2_grab_existing_inode(gl); if (ip) { - set_bit(GIF_DEFER_DELETE, &ip->i_flags); + set_bit(GLF_DEFER_DELETE, &gl->gl_flags); d_prune_aliases(&ip->i_inode); iput(&ip->i_inode); + clear_bit(GLF_DEFER_DELETE, &gl->gl_flags); /* If the inode was evicted, gl->gl_object will now be NULL. 
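gfs2_try_evict() above now funnels both inode lookups through gfs2_grab_existing_inode(), which peeks at gl->gl_object and takes a reference while still holding the glock spinlock, since the pointer is only stable under that lock. A userspace analogue of the grab-under-lock pattern, using a pthread mutex in place of gl_lockref.lock and a plain refcount in place of igrab(); all names are illustrative:

#include <stdio.h>
#include <pthread.h>

struct obj { int refcount; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *slot;	/* like gl->gl_object, may be NULL */

/* The object pointer may only be used outside the lock after a
 * reference has been taken while still holding it. */
static struct obj *grab_existing(void)
{
	struct obj *o;

	pthread_mutex_lock(&lock);
	o = slot;
	if (o)
		o->refcount++;	/* like igrab() succeeding */
	pthread_mutex_unlock(&lock);
	return o;
}

static void put(struct obj *o)
{
	pthread_mutex_lock(&lock);
	if (--o->refcount == 0)
		slot = NULL;	/* last reference: object goes away */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	struct obj o = { .refcount = 1 };
	struct obj *got;

	slot = &o;
	got = grab_existing();
	if (got) {
		printf("grabbed, refcount=%d\n", got->refcount);
		put(got);
	}
	put(&o);
	return 0;
}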
*/ - spin_lock(&gl->gl_lockref.lock); - ip = gl->gl_object; - if (ip) { - clear_bit(GIF_DEFER_DELETE, &ip->i_flags); - if (!igrab(&ip->i_inode)) - ip = NULL; - } - spin_unlock(&gl->gl_lockref.lock); + ip = gfs2_grab_existing_inode(gl); if (ip) { gfs2_glock_poke(ip->i_gl); iput(&ip->i_inode); @@ -1462,9 +1468,7 @@ static inline bool pid_is_meaningful(const struct gfs2_holder *gh) { if (!(gh->gh_flags & GL_NOPID)) return true; - if (gh->gh_state == LM_ST_UNLOCKED) - return true; - return false; + return !test_bit(HIF_HOLDER, &gh->gh_iflags); } /** @@ -1483,7 +1487,6 @@ __acquires(&gl->gl_lockref.lock) { struct gfs2_glock *gl = gh->gh_gl; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - struct list_head *insert_pt = NULL; struct gfs2_holder *gh2; int try_futile = 0; @@ -1519,21 +1522,11 @@ fail: gfs2_holder_wake(gh); return; } - if (test_bit(HIF_HOLDER, &gh2->gh_iflags)) - continue; } trace_gfs2_glock_queue(gh, 1); gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT); - if (likely(insert_pt == NULL)) { - list_add_tail(&gh->gh_list, &gl->gl_holders); - return; - } - list_add_tail(&gh->gh_list, insert_pt); - spin_unlock(&gl->gl_lockref.lock); - if (sdp->sd_lockstruct.ls_ops->lm_cancel) - sdp->sd_lockstruct.ls_ops->lm_cancel(gl); - spin_lock(&gl->gl_lockref.lock); + list_add_tail(&gh->gh_list, &gl->gl_holders); return; trap_recursive: @@ -1673,11 +1666,19 @@ void gfs2_glock_dq(struct gfs2_holder *gh) } if (list_is_first(&gh->gh_list, &gl->gl_holders) && - !test_bit(HIF_HOLDER, &gh->gh_iflags)) { + !test_bit(HIF_HOLDER, &gh->gh_iflags) && + test_bit(GLF_LOCK, &gl->gl_flags) && + !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) && + !test_bit(GLF_CANCELING, &gl->gl_flags)) { + set_bit(GLF_CANCELING, &gl->gl_flags); spin_unlock(&gl->gl_lockref.lock); gl->gl_name.ln_sbd->sd_lockstruct.ls_ops->lm_cancel(gl); wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE); spin_lock(&gl->gl_lockref.lock); + clear_bit(GLF_CANCELING, &gl->gl_flags); + clear_bit(GLF_LOCK, &gl->gl_flags); + if (!gfs2_holder_queued(gh)) + goto out; } /* @@ -1923,6 +1924,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct; spin_lock(&gl->gl_lockref.lock); + clear_bit(GLF_PENDING_REPLY, &gl->gl_flags); gl->gl_reply = ret; if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) { @@ -2323,6 +2325,8 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) *p++ = 'f'; if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags)) *p++ = 'i'; + if (test_bit(GLF_PENDING_REPLY, gflags)) + *p++ = 'R'; if (test_bit(GLF_HAVE_REPLY, gflags)) *p++ = 'r'; if (test_bit(GLF_INITIAL, gflags)) @@ -2347,6 +2351,10 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) *p++ = 'e'; if (test_bit(GLF_VERIFY_DELETE, gflags)) *p++ = 'E'; + if (test_bit(GLF_DEFER_DELETE, gflags)) + *p++ = 's'; + if (test_bit(GLF_CANCELING, gflags)) + *p++ = 'C'; *p = 0; return buf; } diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 4e19cce3d906..74abbd4970f8 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -330,6 +330,9 @@ enum { GLF_UNLOCKED = 16, /* Wait for glock to be unlocked */ GLF_TRY_TO_EVICT = 17, /* iopen glocks only */ GLF_VERIFY_DELETE = 18, /* iopen glocks only */ + GLF_PENDING_REPLY = 19, + GLF_DEFER_DELETE = 20, /* iopen glocks only */ + GLF_CANCELING = 21, }; struct gfs2_glock { @@ -376,7 +379,6 @@ enum { GIF_SW_PAGED = 3, GIF_FREE_VFS_INODE = 5, GIF_GLOP_PENDING = 6, - GIF_DEFER_DELETE = 7, }; struct gfs2_inode { diff 
--git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 314ec2a70167..0fd3b5ec7d8c 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -157,7 +157,9 @@ u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lblock) /** * gfs2_end_log_write_bh - end log write of pagecache data with buffers * @sdp: The superblock - * @bvec: The bio_vec + * @folio: The folio + * @offset: The first byte within the folio that completed + * @size: The number of bytes that completed * @error: The i/o status * * This finds the relevant buffers and unlocks them and sets the @@ -166,17 +168,13 @@ u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lblock) * that is pinned in the pagecache. */ -static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, - struct bio_vec *bvec, - blk_status_t error) +static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct folio *folio, + size_t offset, size_t size, blk_status_t error) { struct buffer_head *bh, *next; - struct page *page = bvec->bv_page; - unsigned size; - bh = page_buffers(page); - size = bvec->bv_len; - while (bh_offset(bh) < bvec->bv_offset) + bh = folio_buffers(folio); + while (bh_offset(bh) < offset) bh = bh->b_this_page; do { if (error) @@ -186,7 +184,7 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, size -= bh->b_size; brelse(bh); bh = next; - } while(bh && size); + } while (bh && size); } /** @@ -203,7 +201,6 @@ static void gfs2_end_log_write(struct bio *bio) { struct gfs2_sbd *sdp = bio->bi_private; struct bio_vec *bvec; - struct page *page; struct bvec_iter_all iter_all; if (bio->bi_status) { @@ -217,9 +214,12 @@ static void gfs2_end_log_write(struct bio *bio) } bio_for_each_segment_all(bvec, bio, iter_all) { - page = bvec->bv_page; - if (page_has_buffers(page)) - gfs2_end_log_write_bh(sdp, bvec, bio->bi_status); + struct page *page = bvec->bv_page; + struct folio *folio = page_folio(page); + + if (folio && folio_buffers(folio)) + gfs2_end_log_write_bh(sdp, folio, bvec->bv_offset, + bvec->bv_len, bio->bi_status); else mempool_free(page, gfs2_page_pool); } @@ -359,8 +359,8 @@ static void gfs2_log_write_bh(struct gfs2_sbd *sdp, struct buffer_head *bh) dblock = gfs2_log_bmap(sdp->sd_jdesc, sdp->sd_log_flush_head); gfs2_log_incr_head(sdp); - gfs2_log_write(sdp, sdp->sd_jdesc, bh->b_page, bh->b_size, - bh_offset(bh), dblock); + gfs2_log_write(sdp, sdp->sd_jdesc, folio_page(bh->b_folio, 0), + bh->b_size, bh_offset(bh), dblock); } /** @@ -406,17 +406,16 @@ static void gfs2_end_log_read(struct bio *bio) } /** - * gfs2_jhead_pg_srch - Look for the journal head in a given page. + * gfs2_jhead_folio_search - Look for the journal head in a given page. * @jd: The journal descriptor * @head: The journal head to start from - * @page: The page to look in + * @folio: The folio to look in * * Returns: 1 if found, 0 otherwise. 
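The folio conversion above keeps the long-standing buffer walk intact: buffer heads of bh->b_size bytes tile the folio, the completion covers only [offset, offset + size), and the loop first skips buffers below offset, then consumes size buffer by buffer. A self-contained sketch of that walk with a toy buffer-head list (illustrative types, not the kernel's):

#include <stdio.h>

struct bh { unsigned int offset; unsigned int size; struct bh *next; };

/* Sketch of gfs2_end_log_write_bh(): complete only the buffers that
 * fall inside the [offset, offset + size) byte range of the folio. */
static void end_io_range(struct bh *first, unsigned int offset,
			 unsigned int size)
{
	struct bh *bh = first;

	/* skip buffers before the completed range */
	while (bh && bh->offset < offset)
		bh = bh->next;

	/* process buffers until the completed byte count is consumed */
	while (bh && size) {
		printf("complete bh at offset %u\n", bh->offset);
		size -= bh->size;
		bh = bh->next;
	}
}

int main(void)
{
	/* four 1 KiB buffers tiling a 4 KiB folio */
	struct bh b3 = { 3072, 1024, NULL };
	struct bh b2 = { 2048, 1024, &b3 };
	struct bh b1 = { 1024, 1024, &b2 };
	struct bh b0 = {    0, 1024, &b1 };

	end_io_range(&b0, 1024, 2048);	/* completes b1 and b2 only */
	return 0;
}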
*/ - -static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, - struct gfs2_log_header_host *head, - struct page *page) +static bool gfs2_jhead_folio_search(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, + struct folio *folio) { struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct gfs2_log_header_host lh; @@ -424,7 +423,8 @@ static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, unsigned int offset; bool ret = false; - kaddr = kmap_local_page(page); + VM_BUG_ON_FOLIO(folio_test_large(folio), folio); + kaddr = kmap_local_folio(folio, 0); for (offset = 0; offset < PAGE_SIZE; offset += sdp->sd_sb.sb_bsize) { if (!__get_log_header(sdp, kaddr + offset, 0, &lh)) { if (lh.lh_sequence >= head->lh_sequence) @@ -472,7 +472,7 @@ static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index, *done = true; if (!*done) - *done = gfs2_jhead_pg_srch(jd, head, &folio->page); + *done = gfs2_jhead_folio_search(jd, head, folio); /* filemap_get_folio() and the earlier grab_cache_page() */ folio_put_refs(folio, 2); @@ -512,9 +512,9 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, unsigned int shift = PAGE_SHIFT - bsize_shift; unsigned int max_blocks = 2 * 1024 * 1024 >> bsize_shift; struct gfs2_journal_extent *je; - int sz, ret = 0; + int ret = 0; struct bio *bio = NULL; - struct page *page = NULL; + struct folio *folio = NULL; bool done = false; errseq_t since; @@ -527,10 +527,11 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, u64 dblock = je->dblock; for (; block < je->lblock + je->blocks; block++, dblock++) { - if (!page) { - page = grab_cache_page(mapping, block >> shift); - if (!page) { - ret = -ENOMEM; + if (!folio) { + folio = filemap_grab_folio(mapping, + block >> shift); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); done = true; goto out; } @@ -541,8 +542,7 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, sector_t sector = dblock << sdp->sd_fsb2bb_shift; if (bio_end_sector(bio) == sector) { - sz = bio_add_page(bio, page, bsize, off); - if (sz == bsize) + if (bio_add_folio(bio, folio, bsize, off)) goto block_added; } if (off) { @@ -562,12 +562,12 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, bio = gfs2_log_alloc_bio(sdp, dblock, gfs2_end_log_read); bio->bi_opf = REQ_OP_READ; add_block_to_new_bio: - sz = bio_add_page(bio, page, bsize, off); - BUG_ON(sz != bsize); + if (!bio_add_folio(bio, folio, bsize, off)) + BUG(); block_added: off += bsize; - if (off == PAGE_SIZE) - page = NULL; + if (off == folio_size(folio)) + folio = NULL; if (blocks_submitted <= blocks_read + max_blocks) { /* Keep at least one bio in flight */ continue; @@ -615,15 +615,13 @@ static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type, static void gfs2_check_magic(struct buffer_head *bh) { - void *kaddr; __be32 *ptr; clear_buffer_escaped(bh); - kaddr = kmap_local_page(bh->b_page); - ptr = kaddr + bh_offset(bh); + ptr = kmap_local_folio(bh->b_folio, bh_offset(bh)); if (*ptr == cpu_to_be32(GFS2_MAGIC)) set_buffer_escaped(bh); - kunmap_local(kaddr); + kunmap_local(ptr); } static int blocknr_cmp(void *priv, const struct list_head *a, diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index fea3efcc2f93..198cc7056637 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -198,15 +198,14 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) static void gfs2_meta_read_endio(struct bio *bio) { - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + 
struct folio_iter fi; - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; - struct buffer_head *bh = page_buffers(page); - unsigned int len = bvec->bv_len; + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; + struct buffer_head *bh = folio_buffers(folio); + size_t len = fi.length; - while (bh_offset(bh) < bvec->bv_offset) + while (bh_offset(bh) < fi.offset) bh = bh->b_this_page; do { struct buffer_head *next = bh->b_this_page; @@ -232,7 +231,7 @@ static void gfs2_submit_bhs(blk_opf_t opf, struct buffer_head *bhs[], int num) bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); while (num > 0) { bh = *bhs; - if (!bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))) { + if (!bio_add_folio(bio, bh->b_folio, bh->b_size, bh_offset(bh))) { BUG_ON(bio->bi_iter.bi_size == 0); break; } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 92a3b6ddafdc..44e5658b896c 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1329,7 +1329,8 @@ static enum evict_behavior evict_should_delete(struct inode *inode, if (unlikely(test_bit(GIF_ALLOC_FAILED, &ip->i_flags))) goto should_delete; - if (test_bit(GIF_DEFER_DELETE, &ip->i_flags)) + if (gfs2_holder_initialized(&ip->i_iopen_gh) && + test_bit(GLF_DEFER_DELETE, &ip->i_iopen_gh.gh_gl->gl_flags)) return EVICT_SHOULD_DEFER_DELETE; /* Deletes should never happen under memory pressure anymore. */ @@ -1338,12 +1339,8 @@ static enum evict_behavior evict_should_delete(struct inode *inode, /* Must not read inode block until block type has been verified */ ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, gh); - if (unlikely(ret)) { - glock_clear_object(ip->i_iopen_gh.gh_gl, ip); - ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq_uninit(&ip->i_iopen_gh); - return EVICT_SHOULD_DEFER_DELETE; - } + if (unlikely(ret)) + return EVICT_SHOULD_SKIP_DELETE; if (gfs2_inode_already_deleted(ip->i_gl, ip->i_no_formal_ino)) return EVICT_SHOULD_SKIP_DELETE; @@ -1363,15 +1360,8 @@ static enum evict_behavior evict_should_delete(struct inode *inode, should_delete: if (gfs2_holder_initialized(&ip->i_iopen_gh) && - test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { - enum evict_behavior behavior = - gfs2_upgrade_iopen_glock(inode); - - if (behavior != EVICT_SHOULD_DELETE) { - gfs2_holder_uninit(&ip->i_iopen_gh); - return behavior; - } - } + test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) + return gfs2_upgrade_iopen_glock(inode); return EVICT_SHOULD_DELETE; } @@ -1509,7 +1499,7 @@ static void gfs2_evict_inode(struct inode *inode) gfs2_glock_put(io_gl); goto out; } - behavior = EVICT_SHOULD_DELETE; + behavior = EVICT_SHOULD_SKIP_DELETE; } if (behavior == EVICT_SHOULD_DELETE) ret = evict_unlinked_inode(inode); diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 8eae8d62a413..26036ffc3f33 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -53,12 +53,20 @@ {(1UL << GLF_DIRTY), "y" }, \ {(1UL << GLF_LFLUSH), "f" }, \ {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \ + {(1UL << GLF_PENDING_REPLY), "R" }, \ {(1UL << GLF_HAVE_REPLY), "r" }, \ {(1UL << GLF_INITIAL), "a" }, \ {(1UL << GLF_HAVE_FROZEN_REPLY), "F" }, \ {(1UL << GLF_LRU), "L" }, \ {(1UL << GLF_OBJECT), "o" }, \ - {(1UL << GLF_BLOCKING), "b" }) + {(1UL << GLF_BLOCKING), "b" }, \ + {(1UL << GLF_UNLOCKED), "x" }, \ + {(1UL << GLF_INSTANTIATE_NEEDED), "n" }, \ + {(1UL << GLF_INSTANTIATE_IN_PROG), "N" }, \ + {(1UL << GLF_TRY_TO_EVICT), "e" }, \ + {(1UL << GLF_VERIFY_DELETE), "E" }, \ + {(1UL << GLF_DEFER_DELETE), "s" }, \ + {(1UL << 
GLF_CANCELING), "C" }) #ifndef NUMPTY #define NUMPTY diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 192213c7359a..f8ae2c666fd6 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -246,12 +246,12 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) if (bd == NULL) { gfs2_log_unlock(sdp); unlock_buffer(bh); - lock_page(bh->b_page); + folio_lock(bh->b_folio); if (bh->b_private == NULL) bd = gfs2_alloc_bufdata(gl, bh); else bd = bh->b_private; - unlock_page(bh->b_page); + folio_unlock(bh->b_folio); lock_buffer(bh); gfs2_log_lock(sdp); } diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index eb2f8273e6f1..09df40b612fb 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -147,7 +147,8 @@ static int do_isofs_readdir(struct inode *inode, struct file *file, de = tmpde; } /* Basic sanity check, whether name doesn't exceed dir entry */ - if (de_len < de->name_len[0] + + if (de_len < sizeof(struct iso_directory_record) || + de_len < de->name_len[0] + sizeof(struct iso_directory_record)) { printk(KERN_NOTICE "iso9660: Corrupted directory entry" " in block %lu of inode %lu\n", block, diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index e8e80761ac73..1c7c49356878 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -57,8 +57,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) * So here, we have a buffer which has just come off the forget list. Look to * see if we can strip all buffers from the backing page. * - * Called under lock_journal(), and possibly under journal_datalist_lock. The - * caller provided us with a ref against the buffer, and we drop that here. + * Called under j_list_lock. The caller provided us with a ref against the + * buffer, and we drop that here. */ static void release_buffer_page(struct buffer_head *bh) { @@ -738,10 +738,8 @@ start_journal_io: err = journal_finish_inode_data_buffers(journal, commit_transaction); if (err) { printk(KERN_WARNING - "JBD2: Detected IO errors while flushing file data " - "on %s\n", journal->j_devname); - if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR) - jbd2_journal_abort(journal, err); + "JBD2: Detected IO errors %d while flushing file data on %s\n", + err, journal->j_devname); err = 0; } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index d8084b31b361..a5ccba25ff47 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -603,7 +603,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid) { int ret = 0; - transaction_t *commit_trans; + transaction_t *commit_trans, *running_trans; if (!(journal->j_flags & JBD2_BARRIER)) return 0; @@ -613,6 +613,16 @@ int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid) goto out; commit_trans = journal->j_committing_transaction; if (!commit_trans || commit_trans->t_tid != tid) { + running_trans = journal->j_running_transaction; + /* + * The query transaction hasn't started committing, + * it must still be running. + */ + if (WARN_ON_ONCE(!running_trans || + running_trans->t_tid != tid)) + goto out; + + running_trans->t_need_data_flush = 1; ret = 1; goto out; } @@ -947,7 +957,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, * descriptor blocks we do need to generate bona fide buffers. * * After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying - * the buffer's contents they really should run flush_dcache_page(bh->b_page). + * the buffer's contents they really should run flush_dcache_folio(bh->b_folio). 
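The isofs fix above adds a check that de_len covers the fixed directory-record header before the name-length byte is trusted, closing the case where a short record made the old single comparison read past the entry. The same validate-before-use shape in a standalone sketch, with a simplified record layout rather than the on-disk iso_directory_record:

#include <stdio.h>

/* Simplified stand-in for a directory record: a total-length byte,
 * a name-length byte, then the name. */
struct rec {
	unsigned char len;
	unsigned char name_len;
	char name[];
};

/* Mirrors the order of checks above: first "is the fixed header
 * present at all?", then "does the claimed name fit inside the
 * claimed record length?". */
static int rec_valid(const struct rec *r)
{
	if (r->len < sizeof(struct rec))
		return 0;
	if (r->len < r->name_len + sizeof(struct rec))
		return 0;
	return 1;
}

int main(void)
{
	unsigned char blob[4] = { 3, 200, 'A', 'B' };	/* lies about name_len */

	printf("valid=%d\n", rec_valid((const struct rec *)blob));
	return 0;
}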
* But we don't bother doing that, so there will be coherency problems with * mmaps of blockdevs which hold live JBD-controlled filesystems. */ @@ -1361,7 +1371,7 @@ static int journal_check_superblock(journal_t *journal) return err; } - if (jbd2_journal_has_csum_v2or3_feature(journal) && + if (jbd2_journal_has_csum_v2or3(journal) && jbd2_has_feature_checksum(journal)) { /* Can't have checksum v1 and v2 on at the same time! */ printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 " @@ -1369,7 +1379,7 @@ static int journal_check_superblock(journal_t *journal) return err; } - if (jbd2_journal_has_csum_v2or3_feature(journal)) { + if (jbd2_journal_has_csum_v2or3(journal)) { if (sb->s_checksum_type != JBD2_CRC32C_CHKSUM) { printk(KERN_ERR "JBD2: Unknown checksum type\n"); return err; @@ -1869,7 +1879,6 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, /* Log is no longer empty */ write_lock(&journal->j_state_lock); - WARN_ON(!sb->s_sequence); journal->j_flags &= ~JBD2_FLUSHED; write_unlock(&journal->j_state_lock); @@ -1965,17 +1974,15 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) return err; } - if (block_start == ~0ULL) { - block_start = phys_block; - block_stop = block_start - 1; - } + if (block_start == ~0ULL) + block_stop = block_start = phys_block; /* * last block not contiguous with current block, * process last contiguous region and return to this block on * next loop */ - if (phys_block != block_stop + 1) { + if (phys_block != block_stop) { block--; } else { block_stop++; @@ -1994,11 +2001,10 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) */ byte_start = block_start * journal->j_blocksize; byte_stop = block_stop * journal->j_blocksize; - byte_count = (block_stop - block_start + 1) * - journal->j_blocksize; + byte_count = (block_stop - block_start) * journal->j_blocksize; truncate_inode_pages_range(journal->j_dev->bd_mapping, - byte_start, byte_stop); + byte_start, byte_stop - 1); if (flags & JBD2_JOURNAL_FLUSH_DISCARD) { err = blkdev_issue_discard(journal->j_dev, @@ -2013,7 +2019,7 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) } if (unlikely(err != 0)) { - pr_err("JBD2: (error %d) unable to wipe journal at physical blocks %llu - %llu", + pr_err("JBD2: (error %d) unable to wipe journal at physical blocks [%llu, %llu)", err, block_start, block_stop); return err; } diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 9192be7c19d8..c271a050b7e6 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -39,7 +39,7 @@ struct recovery_info static int do_one_pass(journal_t *journal, struct recovery_info *info, enum passtype pass); -static int scan_revoke_records(journal_t *, struct buffer_head *, +static int scan_revoke_records(journal_t *, enum passtype, struct buffer_head *, tid_t, struct recovery_info *); #ifdef __KERNEL__ @@ -65,9 +65,8 @@ static void journal_brelse_array(struct buffer_head *b[], int n) */ #define MAXBUF 8 -static int do_readahead(journal_t *journal, unsigned int start) +static void do_readahead(journal_t *journal, unsigned int start) { - int err; unsigned int max, nbufs, next; unsigned long long blocknr; struct buffer_head *bh; @@ -85,7 +84,7 @@ static int do_readahead(journal_t *journal, unsigned int start) nbufs = 0; for (next = start; next < max; next++) { - err = jbd2_journal_bmap(journal, next, &blocknr); + int err = jbd2_journal_bmap(journal, next, &blocknr); if (err) { printk(KERN_ERR "JBD2: bad block at offset %u\n", @@ -94,10 +93,8 @@ static 
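After the rework above, block_stop is exclusive: it starts equal to block_start and is advanced one past the last contiguous block, so the byte count is (block_stop - block_start) * blocksize and the inclusive end passed to truncate_inode_pages_range() is byte_stop - 1, matching the "[start, stop)" wording in the new error message. A quick numeric check of that half-open convention:

#include <stdio.h>

int main(void)
{
	/* half-open block range [block_start, block_stop) */
	unsigned long long block_start = 100, block_stop = 104;
	unsigned int blocksize = 4096;

	unsigned long long byte_start = block_start * blocksize;
	unsigned long long byte_stop  = block_stop * blocksize;
	unsigned long long byte_count = (block_stop - block_start) * blocksize;

	/* 4 blocks of 4096 bytes; inclusive truncate end is byte_stop - 1 */
	printf("count=%llu truncate=[%llu, %llu]\n",
	       byte_count, byte_start, byte_stop - 1);
	return 0;
}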
int do_readahead(journal_t *journal, unsigned int start) } bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); - if (!bh) { - err = -ENOMEM; + if (!bh) goto failed; - } if (!buffer_uptodate(bh) && !buffer_locked(bh)) { bufs[nbufs++] = bh; @@ -112,12 +109,10 @@ static int do_readahead(journal_t *journal, unsigned int start) if (nbufs) bh_readahead_batch(nbufs, bufs, 0); - err = 0; failed: if (nbufs) journal_brelse_array(bufs, nbufs); - return err; } #endif /* __KERNEL__ */ @@ -287,19 +282,20 @@ static int fc_do_one_pass(journal_t *journal, int jbd2_journal_recover(journal_t *journal) { int err, err2; - journal_superblock_t * sb; - struct recovery_info info; memset(&info, 0, sizeof(info)); - sb = journal->j_superblock; /* * The journal superblock's s_start field (the current log head) * is always zero if, and only if, the journal was cleanly - * unmounted. + * unmounted. We use its in-memory version j_tail here because + * jbd2_journal_wipe() could have updated it without updating journal + * superblock. */ - if (!sb->s_start) { + if (!journal->j_tail) { + journal_superblock_t *sb = journal->j_superblock; + jbd2_debug(1, "No recovery required, last transaction %d, head block %u\n", be32_to_cpu(sb->s_sequence), be32_to_cpu(sb->s_head)); journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; @@ -327,6 +323,12 @@ int jbd2_journal_recover(journal_t *journal) journal->j_transaction_sequence, journal->j_head); jbd2_journal_clear_revoke(journal); + /* Free revoke table allocated for replay */ + if (journal->j_revoke != journal->j_revoke_table[0] && + journal->j_revoke != journal->j_revoke_table[1]) { + jbd2_journal_destroy_revoke_table(journal->j_revoke); + journal->j_revoke = journal->j_revoke_table[1]; + } err2 = sync_blockdev(journal->j_fs_dev); if (!err) err = err2; @@ -612,6 +614,31 @@ static int do_one_pass(journal_t *journal, first_commit_ID = next_commit_ID; if (pass == PASS_SCAN) info->start_transaction = first_commit_ID; + else if (pass == PASS_REVOKE) { + /* + * Would the default revoke table have too long hash chains + * during replay? + */ + if (info->nr_revokes > JOURNAL_REVOKE_DEFAULT_HASH * 16) { + unsigned int hash_size; + + /* + * Aim for average chain length of 8, limit at 1M + * entries to avoid problems with malicious + * filesystems. + */ + hash_size = min(roundup_pow_of_two(info->nr_revokes / 8), + 1U << 20); + journal->j_revoke = + jbd2_journal_init_revoke_table(hash_size); + if (!journal->j_revoke) { + printk(KERN_ERR + "JBD2: failed to allocate revoke table for replay with %u entries. " + "Journal replay may be slow.\n", hash_size); + journal->j_revoke = journal->j_revoke_table[1]; + } + } + } jbd2_debug(1, "Starting recovery pass %d\n", pass); @@ -852,6 +879,13 @@ chksum_ok: case JBD2_REVOKE_BLOCK: /* + * If we aren't in the SCAN or REVOKE pass, then we can + * just skip over this block. + */ + if (pass != PASS_REVOKE && pass != PASS_SCAN) + continue; + + /* * Check revoke block crc in pass_scan, if csum verify * failed, check commit block time later. */ @@ -863,12 +897,7 @@ chksum_ok: need_check_commit_time = true; } - /* If we aren't in the REVOKE pass, then we can - * just skip over this block. */ - if (pass != PASS_REVOKE) - continue; - - err = scan_revoke_records(journal, bh, + err = scan_revoke_records(journal, pass, bh, next_commit_ID, info); if (err) goto failed; @@ -922,8 +951,9 @@ chksum_ok: /* Scan a revoke record, marking all blocks mentioned as revoked. 
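The sizing heuristic above is worth a numeric sanity check: with the PASS_SCAN revoke count feeding it, a replay with 300,000 revoke records gets min(roundup_pow_of_two(37500), 2^20) = 65536 buckets, i.e. roughly 4.6 records per chain instead of thousands in the default table. A sketch of the computation, with a local roundup_pow2() standing in for the kernel helper:

#include <stdio.h>

/* Round up to the next power of two, like roundup_pow_of_two(). */
static unsigned int roundup_pow2(unsigned int x)
{
	unsigned int p = 1;

	while (p < x)
		p <<= 1;
	return p;
}

static unsigned int min_u(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* mirrors: hash_size = min(roundup_pow_of_two(nr_revokes / 8), 1U << 20) */
	unsigned int nr_revokes = 300000;
	unsigned int hash_size = min_u(roundup_pow2(nr_revokes / 8), 1U << 20);

	/* 300000/8 = 37500 -> 65536 buckets, ~4.6 records per chain */
	printf("%u buckets, avg chain %.1f\n",
	       hash_size, (double)nr_revokes / hash_size);
	return 0;
}

The 1M-bucket cap bounds the kvmalloc'ed table even if a malicious journal claims an absurd revoke count.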
*/ -static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, - tid_t sequence, struct recovery_info *info) +static int scan_revoke_records(journal_t *journal, enum passtype pass, + struct buffer_head *bh, tid_t sequence, + struct recovery_info *info) { jbd2_journal_revoke_header_t *header; int offset, max; @@ -944,6 +974,11 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, if (jbd2_has_feature_64bit(journal)) record_len = 8; + if (pass == PASS_SCAN) { + info->nr_revokes += (max - offset) / record_len; + return 0; + } + while (offset + record_len <= max) { unsigned long long blocknr; int err; @@ -956,7 +991,6 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, err = jbd2_journal_set_revoke(journal, blocknr, sequence); if (err) return err; - ++info->nr_revokes; } return 0; } diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index ce63d5fde9c3..0cf0fddbee81 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -215,7 +215,7 @@ int __init jbd2_journal_init_revoke_table_cache(void) return 0; } -static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) +struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) { int shift = 0; int tmp = hash_size; @@ -231,7 +231,7 @@ static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) table->hash_size = hash_size; table->hash_shift = shift; table->hash_table = - kmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL); + kvmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL); if (!table->hash_table) { kmem_cache_free(jbd2_revoke_table_cache, table); table = NULL; @@ -245,7 +245,7 @@ out: return table; } -static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) +void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) { int i; struct list_head *hash_list; @@ -255,7 +255,7 @@ static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) J_ASSERT(list_empty(hash_list)); } - kfree(table->hash_table); + kvfree(table->hash_table); kmem_cache_free(jbd2_revoke_table_cache, table); } @@ -420,12 +420,11 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, * do not trust the Revoked bit on buffers unless RevokeValid is also * set. */ -int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) +void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) { struct jbd2_revoke_record_s *record; journal_t *journal = handle->h_transaction->t_journal; int need_cancel; - int did_revoke = 0; /* akpm: debug */ struct buffer_head *bh = jh2bh(jh); jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh); @@ -450,7 +449,6 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) list_del(&record->hash); spin_unlock(&journal->j_revoke_lock); kmem_cache_free(jbd2_revoke_record_cache, record); - did_revoke = 1; } } @@ -473,11 +471,10 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) __brelse(bh2); } } - return did_revoke; } /* - * journal_clear_revoked_flag clears revoked flag of buffers in + * jbd2_clear_buffer_revoked_flags clears revoked flag of buffers in * revoke table to reflect there is no revoked buffers in the next * transaction which is going to be started. 
*/ @@ -506,9 +503,9 @@ void jbd2_clear_buffer_revoked_flags(journal_t *journal) } } -/* journal_switch_revoke table select j_revoke for next transaction - * we do not want to suspend any processing until all revokes are - * written -bzzz +/* jbd2_journal_switch_revoke_table table select j_revoke for next + * transaction we do not want to suspend any processing until all + * revokes are written -bzzz */ void jbd2_journal_switch_revoke_table(journal_t *journal) { diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 66513c18ca29..cbc4785462f5 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -92,7 +92,6 @@ static void jbd2_get_transaction(journal_t *journal, atomic_set(&transaction->t_outstanding_revokes, 0); atomic_set(&transaction->t_handle_count, 0); INIT_LIST_HEAD(&transaction->t_inode_list); - INIT_LIST_HEAD(&transaction->t_private_list); /* Set up the commit timer for the new transaction. */ journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires); @@ -114,12 +113,9 @@ static void jbd2_get_transaction(journal_t *journal, */ /* - * Update transaction's maximum wait time, if debugging is enabled. - * * t_max_wait is carefully updated here with use of atomic compare exchange. * Note that there could be multiplre threads trying to do this simultaneously * hence using cmpxchg to avoid any use of locks in this case. - * With this t_max_wait can be updated w/o enabling jbd2_journal_enable_debug. */ static inline void update_t_max_wait(transaction_t *transaction, unsigned long ts) @@ -2079,21 +2075,6 @@ static void __jbd2_journal_unfile_buffer(struct journal_head *jh) jh->b_transaction = NULL; } -void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh) -{ - struct buffer_head *bh = jh2bh(jh); - - /* Get reference so that buffer cannot be freed before we unlock it */ - get_bh(bh); - spin_lock(&jh->b_state_lock); - spin_lock(&journal->j_list_lock); - __jbd2_journal_unfile_buffer(jh); - spin_unlock(&journal->j_list_lock); - spin_unlock(&jh->b_state_lock); - jbd2_journal_put_journal_head(jh); - __brelse(bh); -} - /** * jbd2_journal_try_to_free_buffers() - try to free page buffers. 
* @journal: journal for operation @@ -2192,7 +2173,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) /* * We don't want to write the buffer anymore, clear the * bit so that we don't confuse checks in - * __journal_file_buffer + * __jbd2_journal_file_buffer */ clear_buffer_dirty(bh); __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 07cfdc440596..60fc92dee24d 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -369,7 +369,7 @@ void jfs_truncate_nolock(struct inode *ip, loff_t length) ASSERT(length >= 0); - if (test_cflag(COMMIT_Nolink, ip)) { + if (test_cflag(COMMIT_Nolink, ip) || isReadOnly(ip)) { xtTruncate(0, ip, length, COMMIT_WMAP); return; } diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index f9009e4f9ffd..26e89d0c69b6 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -178,41 +178,26 @@ int dbMount(struct inode *ipbmap) dbmp_le = (struct dbmap_disk *) mp->data; bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize); bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree); - bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage); - if (bmp->db_l2nbperpage > L2PSIZE - L2MINBLOCKSIZE || - bmp->db_l2nbperpage < 0) { - err = -EINVAL; - goto err_release_metapage; - } - bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag); - if (!bmp->db_numag || bmp->db_numag > MAXAG) { - err = -EINVAL; - goto err_release_metapage; - } - bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel); bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag); bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref); - if (bmp->db_maxag >= MAXAG || bmp->db_maxag < 0 || - bmp->db_agpref >= MAXAG || bmp->db_agpref < 0) { - err = -EINVAL; - goto err_release_metapage; - } - bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel); bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight); bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart); bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size); - if (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG || - bmp->db_agl2size < 0) { - err = -EINVAL; - goto err_release_metapage; - } - if (((bmp->db_mapsize - 1) >> bmp->db_agl2size) > MAXAG) { + if ((bmp->db_l2nbperpage > L2PSIZE - L2MINBLOCKSIZE) || + (bmp->db_l2nbperpage < 0) || + !bmp->db_numag || (bmp->db_numag > MAXAG) || + (bmp->db_maxag >= MAXAG) || (bmp->db_maxag < 0) || + (bmp->db_agpref >= MAXAG) || (bmp->db_agpref < 0) || + !bmp->db_agwidth || + (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG) || + (bmp->db_agl2size < 0) || + ((bmp->db_mapsize - 1) >> bmp->db_agl2size) > MAXAG) { err = -EINVAL; goto err_release_metapage; } @@ -3403,7 +3388,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) oldl2agsize = bmp->db_agl2size; bmp->db_agl2size = l2agsize; - bmp->db_agsize = 1 << l2agsize; + bmp->db_agsize = (s64)1 << l2agsize; /* compute new number of AG */ agno = bmp->db_numag; @@ -3666,8 +3651,8 @@ void dbFinalizeBmap(struct inode *ipbmap) * system size is not a multiple of the group size). */ inactfree = (inactags && ag_rem) ? 
- ((inactags - 1) << bmp->db_agl2size) + ag_rem - : inactags << bmp->db_agl2size; + (((s64)inactags - 1) << bmp->db_agl2size) + ag_rem + : ((s64)inactags << bmp->db_agl2size); /* determine how many free blocks are in the active * allocation groups plus the average number of free blocks diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 8f85177f284b..93db6eec4465 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -117,7 +117,8 @@ do { \ if (!(RC)) { \ if (((P)->header.nextindex > \ (((BN) == 0) ? DTROOTMAXSLOT : (P)->header.maxslot)) || \ - ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT))) { \ + ((BN) && (((P)->header.maxslot > DTPAGEMAXSLOT) || \ + ((P)->header.stblindex >= DTPAGEMAXSLOT)))) { \ BT_PUTPAGE(MP); \ jfs_error((IP)->i_sb, \ "DT_GETPAGE: dtree page corrupt\n"); \ diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c index 63d21822d309..46529bcc8297 100644 --- a/fs/jfs/jfs_extent.c +++ b/fs/jfs/jfs_extent.c @@ -74,6 +74,11 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) int rc; int xflag; + if (isReadOnly(ip)) { + jfs_error(ip->i_sb, "read-only filesystem\n"); + return -EIO; + } + /* This blocks if we are low on resources */ txBeginAnon(ip->i_sb); @@ -253,6 +258,11 @@ int extRecord(struct inode *ip, xad_t * xp) { int rc; + if (isReadOnly(ip)) { + jfs_error(ip->i_sb, "read-only filesystem\n"); + return -EIO; + } + txBeginAnon(ip->i_sb); mutex_lock(&JFS_IP(ip)->commit_mutex); diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index a360b24ed320..ecb8e05b8b84 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -102,7 +102,7 @@ int diMount(struct inode *ipimap) * allocate/initialize the in-memory inode map control structure */ /* allocate the in-memory inode map control structure. */ - imap = kmalloc(sizeof(struct inomap), GFP_KERNEL); + imap = kzalloc(sizeof(struct inomap), GFP_KERNEL); if (imap == NULL) return -ENOMEM; @@ -456,7 +456,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) dp += inum % 8; /* 8 inodes per 4K page */ /* copy on-disk inode to in-memory inode */ - if ((copy_from_dinode(dp, ip)) != 0) { + if ((copy_from_dinode(dp, ip) != 0) || (ip->i_nlink == 0)) { /* handle bad return by returning NULL for ip */ set_nlink(ip, 1); /* Don't want iput() deleting it */ iput(ip); @@ -3029,14 +3029,23 @@ static void duplicateIXtree(struct super_block *sb, s64 blkno, * * RETURN VALUES: * 0 - success - * -ENOMEM - insufficient memory + * -EINVAL - unexpected inode type */ static int copy_from_dinode(struct dinode * dip, struct inode *ip) { struct jfs_inode_info *jfs_ip = JFS_IP(ip); struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + int fileset = le32_to_cpu(dip->di_fileset); + + switch (fileset) { + case AGGR_RESERVED_I: case AGGREGATE_I: case BMAP_I: + case LOG_I: case BADBLOCK_I: case FILESYSTEM_I: + break; + default: + return -EINVAL; + } - jfs_ip->fileset = le32_to_cpu(dip->di_fileset); + jfs_ip->fileset = fileset; jfs_ip->mode2 = le32_to_cpu(dip->di_mode); jfs_set_inode_flags(ip); diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 223d9ac59839..10368c188c5e 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -389,8 +389,8 @@ static int jfs_reconfigure(struct fs_context *fc) if (!ctx->newLVSize) { ctx->newLVSize = sb_bdev_nr_blocks(sb); - if (ctx->newLVSize == 0) - pr_err("JFS: Cannot determine volume size\n"); + if (ctx->newLVSize == 0) + pr_err("JFS: Cannot determine volume size\n"); } rc = jfs_extendfs(sb, ctx->newLVSize, 0); @@ -766,7 +766,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, 
int type, } lock_buffer(bh); memcpy(bh->b_data+offset, data, tocopy); - flush_dcache_page(bh->b_page); + flush_dcache_folio(bh->b_folio); set_buffer_uptodate(bh); mark_buffer_dirty(bh); unlock_buffer(bh); diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 24afbae87225..11d7f74d207b 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -559,11 +559,16 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) size_check: if (EALIST_SIZE(ea_buf->xattr) != ea_size) { - int size = clamp_t(int, ea_size, 0, EALIST_SIZE(ea_buf->xattr)); - - printk(KERN_ERR "ea_get: invalid extended attribute\n"); - print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, - ea_buf->xattr, size, 1); + if (unlikely(EALIST_SIZE(ea_buf->xattr) > INT_MAX)) { + printk(KERN_ERR "ea_get: extended attribute size too large: %u > INT_MAX\n", + EALIST_SIZE(ea_buf->xattr)); + } else { + int size = clamp_t(int, ea_size, 0, EALIST_SIZE(ea_buf->xattr)); + + printk(KERN_ERR "ea_get: invalid extended attribute\n"); + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, + ea_buf->xattr, size, 1); + } ea_release(inode, ea_buf); rc = -EIO; goto clean_up; diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile index fe3e23dd29c3..51bbe22d21e3 100644 --- a/fs/lockd/Makefile +++ b/fs/lockd/Makefile @@ -8,6 +8,6 @@ ccflags-y += -I$(src) # needed for trace events obj-$(CONFIG_LOCKD) += lockd.o lockd-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \ - svcshare.o svcproc.o svcsubs.o mon.o trace.o xdr.o + svcshare.o svcproc.o svcsubs.o mon.o trace.o xdr.o netlink.o lockd-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o lockd-$(CONFIG_PROC_FS) += procfs.o diff --git a/fs/lockd/netlink.c b/fs/lockd/netlink.c new file mode 100644 index 000000000000..6e00b02cad90 --- /dev/null +++ b/fs/lockd/netlink.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/lockd.yaml */ +/* YNL-GEN kernel source */ + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include "netlink.h" + +#include <uapi/linux/lockd_netlink.h> + +/* LOCKD_CMD_SERVER_SET - do */ +static const struct nla_policy lockd_server_set_nl_policy[LOCKD_A_SERVER_UDP_PORT + 1] = { + [LOCKD_A_SERVER_GRACETIME] = { .type = NLA_U32, }, + [LOCKD_A_SERVER_TCP_PORT] = { .type = NLA_U16, }, + [LOCKD_A_SERVER_UDP_PORT] = { .type = NLA_U16, }, +}; + +/* Ops table for lockd */ +static const struct genl_split_ops lockd_nl_ops[] = { + { + .cmd = LOCKD_CMD_SERVER_SET, + .doit = lockd_nl_server_set_doit, + .policy = lockd_server_set_nl_policy, + .maxattr = LOCKD_A_SERVER_UDP_PORT, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = LOCKD_CMD_SERVER_GET, + .doit = lockd_nl_server_get_doit, + .flags = GENL_CMD_CAP_DO, + }, +}; + +struct genl_family lockd_nl_family __ro_after_init = { + .name = LOCKD_FAMILY_NAME, + .version = LOCKD_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = lockd_nl_ops, + .n_split_ops = ARRAY_SIZE(lockd_nl_ops), +}; diff --git a/fs/lockd/netlink.h b/fs/lockd/netlink.h new file mode 100644 index 000000000000..1920543a7955 --- /dev/null +++ b/fs/lockd/netlink.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/lockd.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_LOCKD_GEN_H +#define _LINUX_LOCKD_GEN_H + +#include 
<net/netlink.h> +#include <net/genetlink.h> + +#include <uapi/linux/lockd_netlink.h> + +int lockd_nl_server_set_doit(struct sk_buff *skb, struct genl_info *info); +int lockd_nl_server_get_doit(struct sk_buff *skb, struct genl_info *info); + +extern struct genl_family lockd_nl_family; + +#endif /* _LINUX_LOCKD_GEN_H */ diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h index 17432c445fe6..88e8e2a97397 100644 --- a/fs/lockd/netns.h +++ b/fs/lockd/netns.h @@ -10,6 +10,9 @@ struct lockd_net { unsigned int nlmsvc_users; unsigned long next_gc; unsigned long nrhosts; + u32 gracetime; + u16 tcp_port; + u16 udp_port; struct delayed_work grace_period_end; struct lock_manager lockd_manager; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 2c8eedc6c2cc..e80262a51884 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -41,6 +41,7 @@ #include "netns.h" #include "procfs.h" +#include "netlink.h" #define NLMDBG_FACILITY NLMDBG_SVC #define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE) @@ -83,8 +84,14 @@ static const int nlm_port_min = 0, nlm_port_max = 65535; static struct ctl_table_header * nlm_sysctl_table; #endif -static unsigned long get_lockd_grace_period(void) +static unsigned long get_lockd_grace_period(struct net *net) { + struct lockd_net *ln = net_generic(net, lockd_net_id); + + /* Return the net-ns specific grace period, if there is one */ + if (ln->gracetime) + return ln->gracetime * HZ; + /* Note: nlm_timeout should always be nonzero */ if (nlm_grace_period) return roundup(nlm_grace_period, nlm_timeout) * HZ; @@ -103,7 +110,7 @@ static void grace_ender(struct work_struct *grace) static void set_grace_period(struct net *net) { - unsigned long grace_period = get_lockd_grace_period(); + unsigned long grace_period = get_lockd_grace_period(net); struct lockd_net *ln = net_generic(net, lockd_net_id); locks_start_grace(net, &ln->lockd_manager); @@ -166,15 +173,16 @@ static int create_lockd_listener(struct svc_serv *serv, const char *name, static int create_lockd_family(struct svc_serv *serv, struct net *net, const int family, const struct cred *cred) { + struct lockd_net *ln = net_generic(net, lockd_net_id); int err; - err = create_lockd_listener(serv, "udp", net, family, nlm_udpport, - cred); + err = create_lockd_listener(serv, "udp", net, family, + ln->udp_port ? ln->udp_port : nlm_udpport, cred); if (err < 0) return err; - return create_lockd_listener(serv, "tcp", net, family, nlm_tcpport, - cred); + return create_lockd_listener(serv, "tcp", net, family, + ln->tcp_port ? 
ln->tcp_port : nlm_tcpport, cred); } /* @@ -459,9 +467,10 @@ static const struct ctl_table nlm_sysctls[] = { { .procname = "nsm_local_state", .data = &nsm_local_state, - .maxlen = sizeof(int), + .maxlen = sizeof(nsm_local_state), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_douintvec, + .extra1 = SYSCTL_ZERO, }, }; @@ -588,6 +597,10 @@ static int __init init_nlm(void) if (err) goto err_pernet; + err = genl_register_family(&lockd_nl_family); + if (err) + goto err_netlink; + err = lockd_create_procfs(); if (err) goto err_procfs; @@ -595,6 +608,8 @@ static int __init init_nlm(void) return 0; err_procfs: + genl_unregister_family(&lockd_nl_family); +err_netlink: unregister_pernet_subsys(&lockd_net_ops); err_pernet: #ifdef CONFIG_SYSCTL @@ -608,6 +623,7 @@ static void __exit exit_nlm(void) { /* FIXME: delete all NLM clients */ nlm_shutdown_hosts(); + genl_unregister_family(&lockd_nl_family); lockd_remove_procfs(); unregister_pernet_subsys(&lockd_net_ops); #ifdef CONFIG_SYSCTL @@ -710,3 +726,94 @@ static struct svc_program nlmsvc_program = { .pg_init_request = svc_generic_init_request, .pg_rpcbind_set = svc_generic_rpcbind_set, }; + +/** + * lockd_nl_server_set_doit - set the lockd server parameters via netlink + * @skb: reply buffer + * @info: netlink metadata and command arguments + * + * This updates the per-net values. When updating the values in the init_net + * namespace, also update the "legacy" global values. + * + * Return 0 on success or a negative errno. + */ +int lockd_nl_server_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct lockd_net *ln = net_generic(net, lockd_net_id); + const struct nlattr *attr; + + if (GENL_REQ_ATTR_CHECK(info, LOCKD_A_SERVER_GRACETIME)) + return -EINVAL; + + if (info->attrs[LOCKD_A_SERVER_GRACETIME] || + info->attrs[LOCKD_A_SERVER_TCP_PORT] || + info->attrs[LOCKD_A_SERVER_UDP_PORT]) { + attr = info->attrs[LOCKD_A_SERVER_GRACETIME]; + if (attr) { + u32 gracetime = nla_get_u32(attr); + + if (gracetime > nlm_grace_period_max) + return -EINVAL; + + ln->gracetime = gracetime; + + if (net == &init_net) + nlm_grace_period = gracetime; + } + + attr = info->attrs[LOCKD_A_SERVER_TCP_PORT]; + if (attr) { + ln->tcp_port = nla_get_u16(attr); + if (net == &init_net) + nlm_tcpport = ln->tcp_port; + } + + attr = info->attrs[LOCKD_A_SERVER_UDP_PORT]; + if (attr) { + ln->udp_port = nla_get_u16(attr); + if (net == &init_net) + nlm_udpport = ln->udp_port; + } + } + return 0; +} + +/** + * lockd_nl_server_get_doit - get lockd server parameters via netlink + * @skb: reply buffer + * @info: netlink metadata and command arguments + * + * Return 0 on success or a negative errno. 
+ */ +int lockd_nl_server_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct lockd_net *ln = net_generic(net, lockd_net_id); + void *hdr; + int err; + + skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + hdr = genlmsg_iput(skb, info); + if (!hdr) { + err = -EMSGSIZE; + goto err_free_msg; + } + + err = nla_put_u32(skb, LOCKD_A_SERVER_GRACETIME, ln->gracetime) || + nla_put_u16(skb, LOCKD_A_SERVER_TCP_PORT, ln->tcp_port) || + nla_put_u16(skb, LOCKD_A_SERVER_UDP_PORT, ln->udp_port); + if (err) + goto err_free_msg; + + genlmsg_end(skb, hdr); + + return genlmsg_reply(skb, info); +err_free_msg: + nlmsg_free(skb); + + return err; +} diff --git a/fs/nfs/export.c b/fs/nfs/export.c index be686b8e0c54..e9c233b6fd20 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -154,5 +154,6 @@ const struct export_operations nfs_export_ops = { EXPORT_OP_CLOSE_BEFORE_UNLINK | EXPORT_OP_REMOTE_FS | EXPORT_OP_NOATOMIC_ATTR | - EXPORT_OP_FLUSH_ON_CLOSE, + EXPORT_OP_FLUSH_ON_CLOSE | + EXPORT_OP_NOLOCKS, }; diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c index ea382b75b26c..e2eaac14fd8e 100644 --- a/fs/nfs_common/nfsacl.c +++ b/fs/nfs_common/nfsacl.c @@ -42,7 +42,7 @@ struct nfsacl_encode_desc { }; struct nfsacl_simple_acl { - struct posix_acl acl; + struct posix_acl_hdr acl; struct posix_acl_entry ace[4]; }; @@ -112,7 +112,8 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode, xdr_encode_word(buf, base, entries)) return -EINVAL; if (encode_entries && acl && acl->a_count == 3) { - struct posix_acl *acl2 = &aclbuf.acl; + struct posix_acl *acl2 = + container_of(&aclbuf.acl, struct posix_acl, hdr); /* Avoid the use of posix_acl_alloc(). nfsacl_encode() is * invoked in contexts where a memory allocation failure is @@ -177,7 +178,8 @@ bool nfs_stream_encode_acl(struct xdr_stream *xdr, struct inode *inode, return false; if (encode_entries && acl && acl->a_count == 3) { - struct posix_acl *acl2 = &aclbuf.acl; + struct posix_acl *acl2 = + container_of(&aclbuf.acl, struct posix_acl, hdr); /* Avoid the use of posix_acl_alloc(). nfsacl_encode() is * invoked in contexts where a memory allocation failure is diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index c0bd1509ccd4..792d3fed1b45 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -172,6 +172,16 @@ config NFSD_LEGACY_CLIENT_TRACKING recoverydir, or spawn a process directly using a usermodehelper upcall. - These legacy client tracking methods have proven to be probelmatic + These legacy client tracking methods have proven to be problematic and will be removed in the future. Say Y here if you need support for them in the interim. + +config NFSD_V4_DELEG_TIMESTAMPS + bool "Support delegated timestamps" + depends on NFSD_V4 + default n + help + NFSD implements delegated timestamps according to + draft-ietf-nfsv4-delstid-08 "Extending the Opening of Files". This + is currently an experimental feature and is therefore left disabled + by default. 
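[Editor's aside on the fs/jbd2/recovery.c change earlier in this series: it sizes the replay revoke table from the record count gathered during PASS_SCAN. Resizing only kicks in once the count exceeds sixteen times the default table size; the new table aims for an average hash-chain length of eight; and the size is capped at 2^20 buckets so a malicious journal cannot force a huge allocation. A minimal userspace sketch of just that sizing rule follows — the helper names and sample counts are illustrative, not part of the patch:]

#include <stdio.h>

/* Smallest power of two >= v (for v >= 1); stands in for the kernel's
 * roundup_pow_of_two() in this illustration. */
static unsigned int pow2_roundup(unsigned int v)
{
	unsigned int p = 1;

	while (p < v)
		p <<= 1;
	return p;
}

/* Sizing rule from the patch: aim for hash chains of ~8 entries,
 * capped at 1M buckets. */
static unsigned int replay_hash_size(unsigned int nr_revokes)
{
	unsigned int size = pow2_roundup(nr_revokes / 8);

	return size < (1U << 20) ? size : (1U << 20);
}

int main(void)
{
	/* Illustrative record counts only. */
	unsigned int samples[] = { 8192, 200000, 50000000 };

	for (int i = 0; i < 3; i++)
		printf("%u revokes -> %u buckets\n",
		       samples[i], replay_hash_size(samples[i]));
	return 0;
}

[For 200,000 revoke records this picks 32,768 buckets (average chain just over six); the 50,000,000-record case hits the 1,048,576-bucket cap, giving chains of roughly 48 — long, but bounded, which is the point of the cap.]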
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index fb9b1656a287..ab85e6a2454f 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -319,15 +319,14 @@ nfsd_file_check_writeback(struct nfsd_file *nf) mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); } - -static bool nfsd_file_lru_add(struct nfsd_file *nf) +static void nfsd_file_lru_add(struct nfsd_file *nf) { - set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); - if (list_lru_add_obj(&nfsd_file_lru, &nf->nf_lru)) { + refcount_inc(&nf->nf_ref); + if (list_lru_add_obj(&nfsd_file_lru, &nf->nf_lru)) trace_nfsd_file_lru_add(nf); - return true; - } - return false; + else + WARN_ON(1); + nfsd_file_schedule_laundrette(); } static bool nfsd_file_lru_remove(struct nfsd_file *nf) @@ -363,30 +362,10 @@ nfsd_file_put(struct nfsd_file *nf) if (test_bit(NFSD_FILE_GC, &nf->nf_flags) && test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { - /* - * If this is the last reference (nf_ref == 1), then try to - * transfer it to the LRU. - */ - if (refcount_dec_not_one(&nf->nf_ref)) - return; - - /* Try to add it to the LRU. If that fails, decrement. */ - if (nfsd_file_lru_add(nf)) { - /* If it's still hashed, we're done */ - if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { - nfsd_file_schedule_laundrette(); - return; - } - - /* - * We're racing with unhashing, so try to remove it from - * the LRU. If removal fails, then someone else already - * has our reference. - */ - if (!nfsd_file_lru_remove(nf)) - return; - } + set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); + set_bit(NFSD_FILE_RECENT, &nf->nf_flags); } + if (refcount_dec_and_test(&nf->nf_ref)) nfsd_file_free(nf); } @@ -530,13 +509,12 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru, } /* - * Put the reference held on behalf of the LRU. If it wasn't the last - * one, then just remove it from the LRU and ignore it. + * Put the reference held on behalf of the LRU if it is the last + * reference, else rotate. */ - if (!refcount_dec_and_test(&nf->nf_ref)) { + if (!refcount_dec_if_one(&nf->nf_ref)) { trace_nfsd_file_gc_in_use(nf); - list_lru_isolate(lru, &nf->nf_lru); - return LRU_REMOVED; + return LRU_ROTATE; } /* Refcount went to zero. Unhash it and queue it to the dispose list */ @@ -548,14 +526,54 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru, return LRU_REMOVED; } +static enum lru_status +nfsd_file_gc_cb(struct list_head *item, struct list_lru_one *lru, + void *arg) +{ + struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru); + + if (test_and_clear_bit(NFSD_FILE_RECENT, &nf->nf_flags)) { + /* + * "REFERENCED" really means "should be at the end of the + * LRU. As we are putting it there we can clear the flag. + */ + clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); + trace_nfsd_file_gc_aged(nf); + return LRU_ROTATE; + } + return nfsd_file_lru_cb(item, lru, arg); +} + +/* If the shrinker runs between calls to list_lru_walk_node() in + * nfsd_file_gc(), the "remaining" count will be wrong. This could + * result in premature freeing of some files. This may not matter much + * but is easy to fix with this spinlock which temporarily disables + * the shrinker. 
+ */ +static DEFINE_SPINLOCK(nfsd_gc_lock); static void nfsd_file_gc(void) { + unsigned long ret = 0; LIST_HEAD(dispose); - unsigned long ret; + int nid; + + spin_lock(&nfsd_gc_lock); + for_each_node_state(nid, N_NORMAL_MEMORY) { + unsigned long remaining = list_lru_count_node(&nfsd_file_lru, nid); + + while (remaining > 0) { + unsigned long nr = min(remaining, NFSD_FILE_GC_BATCH); - ret = list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb, - &dispose, list_lru_count(&nfsd_file_lru)); + remaining -= nr; + ret += list_lru_walk_node(&nfsd_file_lru, nid, nfsd_file_gc_cb, + &dispose, &nr); + if (nr) + /* walk aborted early */ + remaining = 0; + } + } + spin_unlock(&nfsd_gc_lock); trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru)); nfsd_file_dispose_list_delayed(&dispose); } @@ -563,9 +581,9 @@ nfsd_file_gc(void) static void nfsd_file_gc_worker(struct work_struct *work) { - nfsd_file_gc(); if (list_lru_count(&nfsd_file_lru)) - nfsd_file_schedule_laundrette(); + nfsd_file_gc(); + nfsd_file_schedule_laundrette(); } static unsigned long @@ -580,8 +598,12 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc) LIST_HEAD(dispose); unsigned long ret; + if (!spin_trylock(&nfsd_gc_lock)) + return SHRINK_STOP; + ret = list_lru_shrink_walk(&nfsd_file_lru, sc, nfsd_file_lru_cb, &dispose); + spin_unlock(&nfsd_gc_lock); trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru)); nfsd_file_dispose_list_delayed(&dispose); return ret; @@ -686,17 +708,12 @@ nfsd_file_close_inode(struct inode *inode) void nfsd_file_close_inode_sync(struct inode *inode) { - struct nfsd_file *nf; LIST_HEAD(dispose); trace_nfsd_file_close(inode); nfsd_file_queue_for_close(inode, &dispose); - while (!list_empty(&dispose)) { - nf = list_first_entry(&dispose, struct nfsd_file, nf_gc); - list_del_init(&nf->nf_gc); - nfsd_file_free(nf); - } + nfsd_file_dispose_list(&dispose); } static int @@ -1058,16 +1075,8 @@ retry: nf = nfsd_file_lookup_locked(net, current_cred(), inode, need, want_gc); rcu_read_unlock(); - if (nf) { - /* - * If the nf is on the LRU then it holds an extra reference - * that must be put if it's removed. It had better not be - * the last one however, since we should hold another. - */ - if (nfsd_file_lru_remove(nf)) - refcount_dec(&nf->nf_ref); + if (nf) goto wait_for_construction; - } new = nfsd_file_alloc(net, inode, need, want_gc); if (!new) { @@ -1161,6 +1170,9 @@ open_file: */ if (status != nfs_ok || inode->i_nlink == 0) nfsd_file_unhash(nf); + else if (want_gc) + nfsd_file_lru_add(nf); + clear_and_wake_up_bit(NFSD_FILE_PENDING, &nf->nf_flags); if (status == nfs_ok) goto out; diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index d5db6b34ba30..5865f9c72712 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -4,6 +4,12 @@ #include <linux/fsnotify_backend.h> /* + * Limit the time that the list_lru_one lock is held during + * an LRU scan. + */ +#define NFSD_FILE_GC_BATCH (16UL) + +/* * This is the fsnotify_mark container that nfsd attaches to the files that it * is holding open. Note that we have a separate refcount here aside from the * one in the fsnotify_mark. 
We only want a single fsnotify_mark attached to @@ -38,6 +44,7 @@ struct nfsd_file { #define NFSD_FILE_PENDING (1) #define NFSD_FILE_REFERENCED (2) #define NFSD_FILE_GC (3) +#define NFSD_FILE_RECENT (4) unsigned long nf_flags; refcount_t nf_ref; unsigned char nf_may; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 484077200c5d..ec6539cec0fe 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -46,8 +46,6 @@ #define NFSDDBG_FACILITY NFSDDBG_PROC -static void nfsd4_mark_cb_fault(struct nfs4_client *clp); - #define NFSPROC4_CB_NULL 0 #define NFSPROC4_CB_COMPOUND 1 @@ -101,15 +99,15 @@ static int decode_cb_fattr4(struct xdr_stream *xdr, uint32_t *bitmap, if (bitmap[0] & FATTR4_WORD0_CHANGE) if (xdr_stream_decode_u64(xdr, &fattr->ncf_cb_change) < 0) - return -NFSERR_BAD_XDR; + return -EIO; if (bitmap[0] & FATTR4_WORD0_SIZE) if (xdr_stream_decode_u64(xdr, &fattr->ncf_cb_fsize) < 0) - return -NFSERR_BAD_XDR; + return -EIO; if (bitmap[2] & FATTR4_WORD2_TIME_DELEG_ACCESS) { fattr4_time_deleg_access access; if (!xdrgen_decode_fattr4_time_deleg_access(xdr, &access)) - return -NFSERR_BAD_XDR; + return -EIO; fattr->ncf_cb_atime.tv_sec = access.seconds; fattr->ncf_cb_atime.tv_nsec = access.nseconds; @@ -118,7 +116,7 @@ static int decode_cb_fattr4(struct xdr_stream *xdr, uint32_t *bitmap, fattr4_time_deleg_modify modify; if (!xdrgen_decode_fattr4_time_deleg_modify(xdr, &modify)) - return -NFSERR_BAD_XDR; + return -EIO; fattr->ncf_cb_mtime.tv_sec = modify.seconds; fattr->ncf_cb_mtime.tv_nsec = modify.nseconds; @@ -682,15 +680,15 @@ static int nfs4_xdr_dec_cb_getattr(struct rpc_rqst *rqstp, if (unlikely(status || cb->cb_status)) return status; if (xdr_stream_decode_uint32_array(xdr, bitmap, 3) < 0) - return -NFSERR_BAD_XDR; + return -EIO; if (xdr_stream_decode_u32(xdr, &attrlen) < 0) - return -NFSERR_BAD_XDR; + return -EIO; maxlen = sizeof(ncf->ncf_cb_change) + sizeof(ncf->ncf_cb_fsize); if (bitmap[2] != 0) maxlen += (sizeof(ncf->ncf_cb_mtime.tv_sec) + sizeof(ncf->ncf_cb_mtime.tv_nsec)) * 2; if (attrlen > maxlen) - return -NFSERR_BAD_XDR; + return -EIO; status = decode_cb_fattr4(xdr, bitmap, ncf); return status; } @@ -1064,6 +1062,17 @@ static bool nfsd4_queue_cb(struct nfsd4_callback *cb) return queue_work(clp->cl_callback_wq, &cb->cb_work); } +static void nfsd4_requeue_cb(struct rpc_task *task, struct nfsd4_callback *cb) +{ + struct nfs4_client *clp = cb->cb_clp; + + if (!test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) { + trace_nfsd_cb_restart(clp, cb); + task->tk_status = 0; + set_bit(NFSD4_CALLBACK_REQUEUE, &cb->cb_flags); + } +} + static void nfsd41_cb_inflight_begin(struct nfs4_client *clp) { atomic_inc(&clp->cl_cb_inflight); @@ -1301,6 +1310,11 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb) trace_nfsd_cb_destroy(clp, cb); nfsd41_cb_release_slot(cb); + if (test_bit(NFSD4_CALLBACK_WAKE, &cb->cb_flags)) + clear_and_wake_up_bit(NFSD4_CALLBACK_RUNNING, &cb->cb_flags); + else + clear_bit(NFSD4_CALLBACK_RUNNING, &cb->cb_flags); + if (cb->cb_ops && cb->cb_ops->release) cb->cb_ops->release(cb); nfsd41_cb_inflight_end(clp); @@ -1328,30 +1342,14 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) rpc_call_start(task); } +/* Returns true if CB_COMPOUND processing should continue */ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback *cb) { - struct nfs4_client *clp = cb->cb_clp; - struct nfsd4_session *session = clp->cl_cb_session; - bool ret = true; - - if (!clp->cl_minorversion) { - /* - * If the backchannel 
connection was shut down while this - * task was queued, we need to resubmit it after setting up - * a new backchannel connection. - * - * Note that if we lost our callback connection permanently - * the submission code will error out, so we don't need to - * handle that case here. - */ - if (RPC_SIGNALLED(task)) - goto need_restart; - - return true; - } + struct nfsd4_session *session = cb->cb_clp->cl_cb_session; + bool ret = false; if (cb->cb_held_slot < 0) - goto need_restart; + goto requeue; /* This is the operation status code for CB_SEQUENCE */ trace_nfsd_cb_seq_status(task, cb); @@ -1365,11 +1363,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback * (sequence ID, cached reply) MUST NOT change. */ ++session->se_cb_seq_nr[cb->cb_held_slot]; + ret = true; break; case -ESERVERFAULT: - ++session->se_cb_seq_nr[cb->cb_held_slot]; + /* + * Call succeeded, but the session, slot index, or slot + * sequence number in the response do not match the same + * in the server's call. The sequence information is thus + * untrustworthy. + */ nfsd4_mark_cb_fault(cb->cb_clp); - ret = false; break; case 1: /* @@ -1381,43 +1384,42 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback fallthrough; case -NFS4ERR_BADSESSION: nfsd4_mark_cb_fault(cb->cb_clp); - ret = false; - goto need_restart; + goto requeue; case -NFS4ERR_DELAY: cb->cb_seq_status = 1; - if (!rpc_restart_call(task)) - goto out; - + if (RPC_SIGNALLED(task) || !rpc_restart_call(task)) + goto requeue; rpc_delay(task, 2 * HZ); return false; + case -NFS4ERR_SEQ_MISORDERED: case -NFS4ERR_BADSLOT: + /* + * A SEQ_MISORDERED or BADSLOT error means that the client and + * server are out of sync as to the backchannel parameters. Mark + * the backchannel faulty and restart the RPC, but leak the slot + * so that it's no longer used. + */ + nfsd4_mark_cb_fault(cb->cb_clp); + cb->cb_held_slot = -1; goto retry_nowait; - case -NFS4ERR_SEQ_MISORDERED: - if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) { - session->se_cb_seq_nr[cb->cb_held_slot] = 1; - goto retry_nowait; - } - break; default: nfsd4_mark_cb_fault(cb->cb_clp); } trace_nfsd_cb_free_slot(task, cb); nfsd41_cb_release_slot(cb); - - if (RPC_SIGNALLED(task)) - goto need_restart; -out: return ret; retry_nowait: - if (rpc_restart_call_prepare(task)) - ret = false; - goto out; -need_restart: - if (!test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) { - trace_nfsd_cb_restart(clp, cb); - task->tk_status = 0; - cb->cb_need_restart = true; + /* + * RPC_SIGNALLED() means that the rpc_client is being torn down and + * (possibly) recreated. Requeue the call in that case. + */ + if (!RPC_SIGNALLED(task)) { + if (rpc_restart_call_prepare(task)) + return false; } +requeue: + nfsd41_cb_release_slot(cb); + nfsd4_requeue_cb(task, cb); return false; } @@ -1428,8 +1430,21 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) trace_nfsd_cb_rpc_done(clp); - if (!nfsd4_cb_sequence_done(task, cb)) + if (!clp->cl_minorversion) { + /* + * If the backchannel connection was shut down while this + * task was queued, we need to resubmit it after setting up + * a new backchannel connection. + * + * Note that if we lost our callback connection permanently + * the submission code will error out, so we don't need to + * handle that case here. 
+ */ + if (RPC_SIGNALLED(task)) + nfsd4_requeue_cb(task, cb); + } else if (!nfsd4_cb_sequence_done(task, cb)) { return; + } if (cb->cb_status) { WARN_ONCE(task->tk_status, @@ -1462,7 +1477,7 @@ static void nfsd4_cb_release(void *calldata) trace_nfsd_cb_rpc_release(cb->cb_clp); - if (cb->cb_need_restart) + if (test_bit(NFSD4_CALLBACK_REQUEUE, &cb->cb_flags)) nfsd4_queue_cb(cb); else nfsd41_destroy_cb(cb); @@ -1575,7 +1590,7 @@ nfsd4_run_cb_work(struct work_struct *work) container_of(work, struct nfsd4_callback, cb_work); struct nfs4_client *clp = cb->cb_clp; struct rpc_clnt *clnt; - int flags; + int flags, ret; trace_nfsd_cb_start(clp); @@ -1601,16 +1616,19 @@ nfsd4_run_cb_work(struct work_struct *work) return; } - if (cb->cb_need_restart) { - cb->cb_need_restart = false; - } else { + if (!test_and_clear_bit(NFSD4_CALLBACK_REQUEUE, &cb->cb_flags)) { if (cb->cb_ops && cb->cb_ops->prepare) cb->cb_ops->prepare(cb); } + cb->cb_msg.rpc_cred = clp->cl_cb_cred; flags = clp->cl_minorversion ? RPC_TASK_NOCONNECT : RPC_TASK_SOFTCONN; - rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | flags, - cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb); + ret = rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | flags, + cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb); + if (ret != 0) { + set_bit(NFSD4_CALLBACK_REQUEUE, &cb->cb_flags); + nfsd4_queue_cb(cb); + } } void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, @@ -1620,10 +1638,10 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, cb->cb_msg.rpc_proc = &nfs4_cb_procedures[op]; cb->cb_msg.rpc_argp = cb; cb->cb_msg.rpc_resp = cb; + cb->cb_flags = 0; cb->cb_ops = ops; INIT_WORK(&cb->cb_work, nfsd4_run_cb_work); cb->cb_status = 0; - cb->cb_need_restart = false; cb->cb_held_slot = -1; } diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index fbfddd3c4c94..290271ac4245 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -344,9 +344,10 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls) atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls); trace_nfsd_layout_recall(&ls->ls_stid.sc_stateid); - refcount_inc(&ls->ls_stid.sc_count); - nfsd4_run_cb(&ls->ls_recall); - + if (!test_and_set_bit(NFSD4_CALLBACK_RUNNING, &ls->ls_recall.cb_flags)) { + refcount_inc(&ls->ls_stid.sc_count); + nfsd4_run_cb(&ls->ls_recall); + } out_unlock: spin_unlock(&ls->ls_lock); } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index f6e06c779d09..b397246dae7b 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1847,7 +1847,7 @@ static void nfsd4_send_cb_offload(struct nfsd4_copy *copy) NFSPROC4_CLNT_CB_OFFLOAD); trace_nfsd_cb_offload(copy->cp_clp, &cbo->co_res.cb_stateid, &cbo->co_fh, copy->cp_count, copy->nfserr); - nfsd4_run_cb(&cbo->co_cb); + nfsd4_try_run_cb(&cbo->co_cb); } /** diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 153eeea2c7c9..2041268b398a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -946,15 +946,6 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *sla spin_lock_init(&stid->sc_lock); INIT_LIST_HEAD(&stid->sc_cp_list); - /* - * It shouldn't be a problem to reuse an opaque stateid value. - * I don't think it is for 4.1. But with 4.0 I worry that, for - * example, a stray write retransmission could be accepted by - * the server when it should have been rejected. 
Therefore, - * adopt a trick from the sctp code to attempt to maximize the - * amount of time until an id is reused, by ensuring they always - * "increase" (mod INT_MAX): - */ return stid; out_free: kmem_cache_free(slab, stid); @@ -1050,6 +1041,12 @@ static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp) return openlockstateid(stid); } +/* + * As the sc_free callback of deleg, this may be called by nfs4_put_stid + * in nfsd_break_one_deleg. + * Considering nfsd_break_one_deleg is called with the flc->flc_lock held, + * this function mustn't ever sleep. + */ static void nfs4_free_deleg(struct nfs4_stid *stid) { struct nfs4_delegation *dp = delegstateid(stid); @@ -1378,7 +1375,8 @@ static void revoke_delegation(struct nfs4_delegation *dp) struct nfs4_client *clp = dp->dl_stid.sc_client; WARN_ON(!list_empty(&dp->dl_recall_lru)); - WARN_ON_ONCE(!(dp->dl_stid.sc_status & + WARN_ON_ONCE(dp->dl_stid.sc_client->cl_minorversion > 0 && + !(dp->dl_stid.sc_status & (SC_STATUS_REVOKED | SC_STATUS_ADMIN_REVOKED))); trace_nfsd_stid_revoke(&dp->dl_stid); @@ -3168,7 +3166,6 @@ nfsd4_cb_recall_any_release(struct nfsd4_callback *cb) { struct nfs4_client *clp = cb->cb_clp; - clear_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags); drop_client(clp); } @@ -3199,7 +3196,6 @@ nfsd4_cb_getattr_release(struct nfsd4_callback *cb) struct nfs4_delegation *dp = container_of(ncf, struct nfs4_delegation, dl_cb_fattr); - clear_and_wake_up_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags); nfs4_put_stid(&dp->dl_stid); } @@ -3220,11 +3216,15 @@ static void nfs4_cb_getattr(struct nfs4_cb_fattr *ncf) struct nfs4_delegation *dp = container_of(ncf, struct nfs4_delegation, dl_cb_fattr); - if (test_and_set_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags)) + if (test_and_set_bit(NFSD4_CALLBACK_RUNNING, &ncf->ncf_getattr.cb_flags)) return; + /* set to proper status when nfsd4_cb_getattr_done runs */ ncf->ncf_cb_status = NFS4ERR_IO; + /* ensure that wake_bit is done when RUNNING is cleared */ + set_bit(NFSD4_CALLBACK_WAKE, &ncf->ncf_getattr.cb_flags); + refcount_inc(&dp->dl_stid.sc_count); nfsd4_run_cb(&ncf->ncf_getattr); } @@ -4815,8 +4815,8 @@ out: static unsigned long nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) { - int count; struct nfsd_net *nn = shrink->private_data; + long count; count = atomic_read(&nn->nfsd_courtesy_clients); if (!count) @@ -5414,6 +5414,11 @@ static const struct nfsd4_callback_ops nfsd4_cb_recall_ops = { static void nfsd_break_one_deleg(struct nfs4_delegation *dp) { + bool queued; + + if (test_and_set_bit(NFSD4_CALLBACK_RUNNING, &dp->dl_recall.cb_flags)) + return; + /* * We're assuming the state code never drops its reference * without first removing the lease. Since we're in this lease @@ -5422,7 +5427,10 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp) * we know it's safe to take a reference. */ refcount_inc(&dp->dl_stid.sc_count); - WARN_ON_ONCE(!nfsd4_run_cb(&dp->dl_recall)); + queued = nfsd4_run_cb(&dp->dl_recall); + WARN_ON_ONCE(!queued); + if (!queued) + nfs4_put_stid(&dp->dl_stid); } /* Called from break_lease() with flc_lock held. 
*/ @@ -5948,11 +5956,23 @@ nfsd4_verify_setuid_write(struct nfsd4_open *open, struct nfsd_file *nf) return 0; } +#ifdef CONFIG_NFSD_V4_DELEG_TIMESTAMPS +static bool nfsd4_want_deleg_timestamps(const struct nfsd4_open *open) +{ + return open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS; +} +#else /* CONFIG_NFSD_V4_DELEG_TIMESTAMPS */ +static bool nfsd4_want_deleg_timestamps(const struct nfsd4_open *open) +{ + return false; +} +#endif /* CONFIG NFSD_V4_DELEG_TIMESTAMPS */ + static struct nfs4_delegation * nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, struct svc_fh *parent) { - bool deleg_ts = open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS; + bool deleg_ts = nfsd4_want_deleg_timestamps(open); struct nfs4_client *clp = stp->st_stid.sc_client; struct nfs4_file *fp = stp->st_stid.sc_file; struct nfs4_clnt_odstate *odstate = stp->st_clnt_odstate; @@ -5999,6 +6019,15 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, if (!nf) return ERR_PTR(-EAGAIN); + /* + * File delegations and associated locks cannot be recovered if the + * export is from an NFS proxy server. + */ + if (exportfs_cannot_lock(nf->nf_file->f_path.mnt->mnt_sb->s_export_op)) { + nfsd_file_put(nf); + return ERR_PTR(-EOPNOTSUPP); + } + spin_lock(&state_lock); spin_lock(&fp->fi_lock); if (nfs4_delegation_exists(clp, fp)) @@ -6151,8 +6180,8 @@ static void nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, struct svc_fh *currentfh) { - bool deleg_ts = open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS; struct nfs4_openowner *oo = openowner(stp->st_stateowner); + bool deleg_ts = nfsd4_want_deleg_timestamps(open); struct nfs4_client *clp = stp->st_stid.sc_client; struct svc_fh *parent = NULL; struct nfs4_delegation *dp; @@ -6855,38 +6884,34 @@ deleg_reaper(struct nfsd_net *nn) { struct list_head *pos, *next; struct nfs4_client *clp; - LIST_HEAD(cblist); spin_lock(&nn->client_lock); list_for_each_safe(pos, next, &nn->client_lru) { clp = list_entry(pos, struct nfs4_client, cl_lru); - if (clp->cl_state != NFSD4_ACTIVE || - list_empty(&clp->cl_delegations) || - atomic_read(&clp->cl_delegs_in_recall) || - test_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags) || - (ktime_get_boottime_seconds() - - clp->cl_ra_time < 5)) { + + if (clp->cl_state != NFSD4_ACTIVE) + continue; + if (list_empty(&clp->cl_delegations)) + continue; + if (atomic_read(&clp->cl_delegs_in_recall)) + continue; + if (test_and_set_bit(NFSD4_CALLBACK_RUNNING, &clp->cl_ra->ra_cb.cb_flags)) + continue; + if (ktime_get_boottime_seconds() - clp->cl_ra_time < 5) + continue; + if (clp->cl_cb_state != NFSD4_CB_UP) continue; - } - list_add(&clp->cl_ra_cblist, &cblist); /* release in nfsd4_cb_recall_any_release */ kref_get(&clp->cl_nfsdfs.cl_ref); - set_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags); clp->cl_ra_time = ktime_get_boottime_seconds(); - } - spin_unlock(&nn->client_lock); - - while (!list_empty(&cblist)) { - clp = list_first_entry(&cblist, struct nfs4_client, - cl_ra_cblist); - list_del_init(&clp->cl_ra_cblist); clp->cl_ra->ra_keep = 0; clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG) | BIT(RCA4_TYPE_MASK_WDATA_DLG); trace_nfsd_cb_recall_any(clp->cl_ra); nfsd4_run_cb(&clp->cl_ra->ra_cb); } + spin_unlock(&nn->client_lock); } static void @@ -7051,7 +7076,7 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, */ statusmask |= SC_STATUS_REVOKED; - statusmask |= SC_STATUS_ADMIN_REVOKED; + statusmask |= SC_STATUS_ADMIN_REVOKED | SC_STATUS_FREEABLE; if 
(ZERO_STATEID(stateid) || ONE_STATEID(stateid) || CLOSE_STATEID(stateid)) @@ -7706,9 +7731,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) return status; - status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, - SC_STATUS_REVOKED | SC_STATUS_FREEABLE, - &s, nn); + status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, SC_STATUS_REVOKED, &s, nn); if (status) goto out; dp = delegstateid(s); @@ -7816,7 +7839,7 @@ nfsd4_lm_notify(struct file_lock *fl) if (queue) { trace_nfsd_cb_notify_lock(lo, nbl); - nfsd4_run_cb(&nbl->nbl_cb); + nfsd4_try_run_cb(&nbl->nbl_cb); } } @@ -8134,6 +8157,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0); if (status != nfs_ok) return status; + if (exportfs_cannot_lock(cstate->current_fh.fh_dentry->d_sb->s_export_op)) { + status = nfserr_notsupp; + goto out; + } if (lock->lk_is_new) { if (nfsd4_has_session(cstate)) @@ -8473,6 +8500,11 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfserr_lock_range; goto put_stateid; } + if (exportfs_cannot_lock(nf->nf_file->f_path.mnt->mnt_sb->s_export_op)) { + status = nfserr_notsupp; + goto put_file; + } + file_lock = locks_alloc_lock(); if (!file_lock) { dprintk("NFSD: %s: unable to allocate lock!\n", __func__); @@ -9182,8 +9214,8 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, nfs4_cb_getattr(&dp->dl_cb_fattr); spin_unlock(&ctx->flc_lock); - wait_on_bit_timeout(&ncf->ncf_cb_flags, CB_GETATTR_BUSY, - TASK_INTERRUPTIBLE, NFSD_CB_GETATTR_TIMEOUT); + wait_on_bit_timeout(&ncf->ncf_getattr.cb_flags, NFSD4_CALLBACK_RUNNING, + TASK_UNINTERRUPTIBLE, NFSD_CB_GETATTR_TIMEOUT); if (ncf->ncf_cb_status) { /* Recall delegation only if client didn't respond */ status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index ce2a71e4904c..ac265d6fde35 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1917,6 +1917,7 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) struct svc_serv *serv; LIST_HEAD(permsocks); struct nfsd_net *nn; + bool delete = false; int err, rem; mutex_lock(&nfsd_mutex); @@ -1977,34 +1978,28 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) } } - /* For now, no removing old sockets while server is running */ - if (serv->sv_nrthreads && !list_empty(&permsocks)) { + /* + * If there are listener transports remaining on the permsocks list, + * it means we were asked to remove a listener. + */ + if (!list_empty(&permsocks)) { list_splice_init(&permsocks, &serv->sv_permsocks); - spin_unlock_bh(&serv->sv_lock); - err = -EBUSY; - goto out_unlock_mtx; + delete = true; } + spin_unlock_bh(&serv->sv_lock); - /* Close the remaining sockets on the permsocks list */ - while (!list_empty(&permsocks)) { - xprt = list_first_entry(&permsocks, struct svc_xprt, xpt_list); - list_move(&xprt->xpt_list, &serv->sv_permsocks); - - /* - * Newly-created sockets are born with the BUSY bit set. Clear - * it if there are no threads, since nothing can pick it up - * in that case. - */ - if (!serv->sv_nrthreads) - clear_bit(XPT_BUSY, &xprt->xpt_flags); - - set_bit(XPT_CLOSE, &xprt->xpt_flags); - spin_unlock_bh(&serv->sv_lock); - svc_xprt_close(xprt); - spin_lock_bh(&serv->sv_lock); + /* Do not remove listeners while there are active threads. 
*/ + if (serv->sv_nrthreads) { + err = -EBUSY; + goto out_unlock_mtx; } - spin_unlock_bh(&serv->sv_lock); + /* + * Since we can't delete an arbitrary llist entry, destroy the + * remaining listeners and recreate the list. + */ + if (delete) + svc_xprt_destroy_all(serv, net); /* walk list of addrs again, open any that still don't exist */ nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { @@ -2031,6 +2026,9 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) xprt = svc_find_listener(serv, xcl_name, net, sa); if (xprt) { + if (delete) + WARN_ONCE(1, "Transport type=%s already exists\n", + xcl_name); svc_xprt_put(xprt); continue; } @@ -2204,8 +2202,14 @@ static __net_init int nfsd_net_init(struct net *net) NFSD_STATS_COUNTERS_NUM); if (retval) goto out_repcache_error; + memset(&nn->nfsd_svcstats, 0, sizeof(nn->nfsd_svcstats)); nn->nfsd_svcstats.program = &nfsd_programs[0]; + if (!nfsd_proc_stat_init(net)) { + retval = -ENOMEM; + goto out_proc_error; + } + for (i = 0; i < sizeof(nn->nfsd_versions); i++) nn->nfsd_versions[i] = nfsd_support_version(i); for (i = 0; i < sizeof(nn->nfsd4_minorversions); i++) @@ -2215,13 +2219,14 @@ static __net_init int nfsd_net_init(struct net *net) nfsd4_init_leases_net(nn); get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key)); seqlock_init(&nn->writeverf_lock); - nfsd_proc_stat_init(net); #if IS_ENABLED(CONFIG_NFS_LOCALIO) spin_lock_init(&nn->local_clients_lock); INIT_LIST_HEAD(&nn->local_clients); #endif return 0; +out_proc_error: + percpu_counter_destroy_many(nn->counter, NFSD_STATS_COUNTERS_NUM); out_repcache_error: nfsd_idmap_shutdown(net); out_idmap_error: diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 74d2d7b42676..290e29dd43eb 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -67,12 +67,15 @@ typedef struct { struct nfsd4_callback { struct nfs4_client *cb_clp; struct rpc_message cb_msg; +#define NFSD4_CALLBACK_RUNNING (0) +#define NFSD4_CALLBACK_WAKE (1) +#define NFSD4_CALLBACK_REQUEUE (2) + unsigned long cb_flags; const struct nfsd4_callback_ops *cb_ops; struct work_struct cb_work; int cb_seq_status; int cb_status; int cb_held_slot; - bool cb_need_restart; }; struct nfsd4_callback_ops { @@ -162,15 +165,11 @@ struct nfs4_cb_fattr { struct timespec64 ncf_cb_mtime; struct timespec64 ncf_cb_atime; - unsigned long ncf_cb_flags; bool ncf_file_modified; u64 ncf_initial_cinfo; u64 ncf_cur_fsize; }; -/* bits for ncf_cb_flags */ -#define CB_GETATTR_BUSY 0 - /* * Represents a delegation stateid. 
The nfs4_client holds references to these * and they are put when it is being destroyed or when the delegation is @@ -198,8 +197,8 @@ struct nfs4_delegation { struct list_head dl_perclnt; struct list_head dl_recall_lru; /* delegation recalled */ struct nfs4_clnt_odstate *dl_clnt_odstate; - u32 dl_type; time64_t dl_time; + u32 dl_type; /* For recall: */ int dl_retries; struct nfsd4_callback dl_recall; @@ -452,7 +451,6 @@ struct nfs4_client { #define NFSD4_CLIENT_UPCALL_LOCK (5) /* upcall serialization */ #define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \ 1 << NFSD4_CLIENT_CB_KILL) -#define NFSD4_CLIENT_CB_RECALL_ANY (6) unsigned long cl_flags; struct workqueue_struct *cl_callback_wq; @@ -498,7 +496,6 @@ struct nfs4_client { struct nfsd4_cb_recall_any *cl_ra; time64_t cl_ra_time; - struct list_head cl_ra_cblist; }; /* struct nfs4_client_reset @@ -780,6 +777,13 @@ extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn * extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op); extern bool nfsd4_run_cb(struct nfsd4_callback *cb); + +static inline void nfsd4_try_run_cb(struct nfsd4_callback *cb) +{ + if (!test_and_set_bit(NFSD4_CALLBACK_RUNNING, &cb->cb_flags)) + WARN_ON_ONCE(!nfsd4_run_cb(cb)); +} + extern void nfsd4_shutdown_callback(struct nfs4_client *); extern void nfsd4_shutdown_copy(struct nfs4_client *clp); void nfsd4_async_copy_reaper(struct nfsd_net *nn); diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index bb22893f1157..f7eaf95e20fc 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -73,11 +73,11 @@ static int nfsd_show(struct seq_file *seq, void *v) DEFINE_PROC_SHOW_ATTRIBUTE(nfsd); -void nfsd_proc_stat_init(struct net *net) +struct proc_dir_entry *nfsd_proc_stat_init(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - svc_proc_register(net, &nn->nfsd_svcstats, &nfsd_proc_ops); + return svc_proc_register(net, &nn->nfsd_svcstats, &nfsd_proc_ops); } void nfsd_proc_stat_shutdown(struct net *net) diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index 04aacb6c36e2..e4efb0e4e56d 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -10,7 +10,7 @@ #include <uapi/linux/nfsd/stats.h> #include <linux/percpu_counter.h> -void nfsd_proc_stat_init(struct net *net); +struct proc_dir_entry *nfsd_proc_stat_init(struct net *net); void nfsd_proc_stat_shutdown(struct net *net); static inline void nfsd_stats_rc_hits_inc(struct nfsd_net *nn) diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index ad2c0c432d08..a7630e9f6577 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -803,6 +803,14 @@ DEFINE_EVENT(nfsd_cs_slot_class, nfsd_##name, \ DEFINE_CS_SLOT_EVENT(slot_seqid_conf); DEFINE_CS_SLOT_EVENT(slot_seqid_unconf); +#define show_nfs_slot_flags(val) \ + __print_flags(val, "|", \ + { NFSD4_SLOT_INUSE, "INUSE" }, \ + { NFSD4_SLOT_CACHETHIS, "CACHETHIS" }, \ + { NFSD4_SLOT_INITIALIZED, "INITIALIZED" }, \ + { NFSD4_SLOT_CACHED, "CACHED" }, \ + { NFSD4_SLOT_REUSED, "REUSED" }) + TRACE_EVENT(nfsd_slot_seqid_sequence, TP_PROTO( const struct nfs4_client *clp, @@ -813,10 +821,11 @@ TRACE_EVENT(nfsd_slot_seqid_sequence, TP_STRUCT__entry( __field(u32, seqid) __field(u32, slot_seqid) + __field(u32, slot_index) + __field(unsigned long, slot_flags) __field(u32, cl_boot) __field(u32, cl_id) __sockaddr(addr, clp->cl_cb_conn.cb_addrlen) - __field(bool, in_use) ), TP_fast_assign( __entry->cl_boot = clp->cl_clientid.cl_boot; @@ -825,11 +834,13 @@ 
TRACE_EVENT(nfsd_slot_seqid_sequence, clp->cl_cb_conn.cb_addrlen); __entry->seqid = seq->seqid; __entry->slot_seqid = slot->sl_seqid; + __entry->slot_index = seq->slotid; + __entry->slot_flags = slot->sl_flags; ), - TP_printk("addr=%pISpc client %08x:%08x seqid=%u slot_seqid=%u (%sin use)", + TP_printk("addr=%pISpc client %08x:%08x idx=%u seqid=%u slot_seqid=%u flags=%s", __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, - __entry->seqid, __entry->slot_seqid, - __entry->in_use ? "" : "not " + __entry->slot_index, __entry->seqid, __entry->slot_seqid, + show_nfs_slot_flags(__entry->slot_flags) ) ); @@ -1039,6 +1050,7 @@ DEFINE_CLID_EVENT(confirmed_r); { 1 << NFSD_FILE_HASHED, "HASHED" }, \ { 1 << NFSD_FILE_PENDING, "PENDING" }, \ { 1 << NFSD_FILE_REFERENCED, "REFERENCED" }, \ + { 1 << NFSD_FILE_RECENT, "RECENT" }, \ { 1 << NFSD_FILE_GC, "GC" }) DECLARE_EVENT_CLASS(nfsd_file_class, @@ -1317,6 +1329,7 @@ DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_del_disposed); DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_in_use); DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_writeback); DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_referenced); +DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_aged); DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_disposed); DECLARE_EVENT_CLASS(nfsd_file_lruwalk_class, @@ -1346,6 +1359,7 @@ DEFINE_EVENT(nfsd_file_lruwalk_class, name, \ TP_ARGS(removed, remaining)) DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_gc_removed); +DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_gc_recent); DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_shrinker_removed); TRACE_EVENT(nfsd_file_close, @@ -1602,7 +1616,7 @@ DECLARE_EVENT_CLASS(nfsd_cb_lifetime_class, __entry->cl_id = clp->cl_clientid.cl_id; __entry->cb = cb; __entry->opcode = cb->cb_ops ? cb->cb_ops->opcode : _CB_NULL; - __entry->need_restart = cb->cb_need_restart; + __entry->need_restart = test_bit(NFSD4_CALLBACK_REQUEUE, &cb->cb_flags); __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, clp->cl_cb_conn.cb_addrlen) ), diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 34d7aa531662..9abdc4b75813 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -71,7 +71,6 @@ nfserrno (int errno) { nfserr_acces, -EACCES }, { nfserr_exist, -EEXIST }, { nfserr_xdev, -EXDEV }, - { nfserr_mlink, -EMLINK }, { nfserr_nodev, -ENODEV }, { nfserr_notdir, -ENOTDIR }, { nfserr_isdir, -EISDIR }, @@ -1687,9 +1686,17 @@ out: return err; } -/* - * Create a hardlink - * N.B. After this call _both_ ffhp and tfhp need an fh_put +/** + * nfsd_link - create a link + * @rqstp: RPC transaction context + * @ffhp: the file handle of the directory where the new link is to be created + * @name: the filename of the new link + * @len: the length of @name in octets + * @tfhp: the file handle of an existing file object + * + * After this call _both_ ffhp and tfhp need an fh_put. + * + * Returns a generic NFS status code in network byte-order. 
*/ __be32 nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, @@ -1697,6 +1704,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, { struct dentry *ddir, *dnew, *dold; struct inode *dirp; + int type; __be32 err; int host_err; @@ -1716,11 +1724,11 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, if (isdotent(name, len)) goto out; + err = nfs_ok; + type = d_inode(tfhp->fh_dentry)->i_mode & S_IFMT; host_err = fh_want_write(tfhp); - if (host_err) { - err = nfserrno(host_err); + if (host_err) goto out; - } ddir = ffhp->fh_dentry; dirp = d_inode(ddir); @@ -1728,7 +1736,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, dnew = lookup_one_len(name, ddir, len); if (IS_ERR(dnew)) { - err = nfserrno(PTR_ERR(dnew)); + host_err = PTR_ERR(dnew); goto out_unlock; } @@ -1744,17 +1752,26 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, fh_fill_post_attrs(ffhp); inode_unlock(dirp); if (!host_err) { - err = nfserrno(commit_metadata(ffhp)); - if (!err) - err = nfserrno(commit_metadata(tfhp)); - } else { - err = nfserrno(host_err); + host_err = commit_metadata(ffhp); + if (!host_err) + host_err = commit_metadata(tfhp); } + dput(dnew); out_drop_write: fh_drop_write(tfhp); + if (host_err == -EBUSY) { + /* + * See RFC 8881 Section 18.9.4 para 1-2: NFSv4 LINK + * wants a status unique to the object type. + */ + if (type != S_IFDIR) + err = nfserr_file_open; + else + err = nfserr_acces; + } out: - return err; + return err != nfs_ok ? err : nfserrno(host_err); out_dput: dput(dnew); @@ -1783,9 +1800,19 @@ nfsd_has_cached_files(struct dentry *dentry) return ret; } -/* - * Rename a file - * N.B. After this call _both_ ffhp and tfhp need an fh_put +/** + * nfsd_rename - rename a directory entry + * @rqstp: RPC transaction context + * @ffhp: the file handle of parent directory containing the entry to be renamed + * @fname: the filename of directory entry to be renamed + * @flen: the length of @fname in octets + * @tfhp: the file handle of parent directory to contain the renamed entry + * @tname: the filename of the new entry + * @tlen: the length of @tlen in octets + * + * After this call _both_ ffhp and tfhp need an fh_put. + * + * Returns a generic NFS status code in network byte-order. */ __be32 nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, @@ -1793,6 +1820,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, { struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap; struct inode *fdir, *tdir; + int type = S_IFDIR; __be32 err; int host_err; bool close_cached = false; @@ -1850,11 +1878,14 @@ retry: host_err = -EINVAL; if (odentry == trap) goto out_dput_old; + type = d_inode(odentry)->i_mode & S_IFMT; ndentry = lookup_one_len(tname, tdentry, tlen); host_err = PTR_ERR(ndentry); if (IS_ERR(ndentry)) goto out_dput_old; + if (d_inode(ndentry)) + type = d_inode(ndentry)->i_mode & S_IFMT; host_err = -ENOTEMPTY; if (ndentry == trap) goto out_dput_new; @@ -1892,7 +1923,18 @@ retry: out_dput_old: dput(odentry); out_nfserr: - err = nfserrno(host_err); + if (host_err == -EBUSY) { + /* + * See RFC 8881 Section 18.26.4 para 1-3: NFSv4 RENAME + * wants a status unique to the object type. + */ + if (type != S_IFDIR) + err = nfserr_file_open; + else + err = nfserr_acces; + } else { + err = nfserrno(host_err); + } if (!close_cached) { fh_fill_post_attrs(ffhp); @@ -1919,9 +1961,17 @@ out: return err; } -/* - * Unlink a file or directory - * N.B. 
After this call fhp needs an fh_put +/** + * nfsd_unlink - remove a directory entry + * @rqstp: RPC transaction context + * @fhp: the file handle of the parent directory to be modified + * @type: enforced file type of the object to be removed + * @fname: the name of directory entry to be removed + * @flen: length of @fname in octets + * + * After this call fhp needs an fh_put. + * + * Returns a generic NFS status code in network byte-order. */ __be32 nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, @@ -1995,15 +2045,17 @@ out_drop_write: fh_drop_write(fhp); out_nfserr: if (host_err == -EBUSY) { - /* name is mounted-on. There is no perfect - * error status. + /* + * See RFC 8881 Section 18.25.4 para 4: NFSv4 REMOVE + * wants a status unique to the object type. */ - err = nfserr_file_open; - } else { - err = nfserrno(host_err); + if (type != S_IFDIR) + err = nfserr_file_open; + else + err = nfserr_acces; } out: - return err; + return err != nfs_ok ? err : nfserrno(host_err); out_unlock: inode_unlock(dirp); goto out_drop_write; diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index edcca4beb765..b562d3dbc76b 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -197,18 +197,6 @@ int orangefs_bufmap_size_query(void) return size; } -int orangefs_bufmap_shift_query(void) -{ - struct orangefs_bufmap *bufmap; - int shift = 0; - spin_lock(&orangefs_bufmap_lock); - bufmap = __orangefs_bufmap; - if (bufmap) - shift = bufmap->desc_shift; - spin_unlock(&orangefs_bufmap_lock); - return shift; -} - static DECLARE_WAIT_QUEUE_HEAD(bufmap_waitq); static DECLARE_WAIT_QUEUE_HEAD(readdir_waitq); @@ -532,16 +520,3 @@ int orangefs_bufmap_copy_to_iovec(struct iov_iter *iter, } return 0; } - -void orangefs_bufmap_page_fill(void *page_to, - int buffer_index, - int slot_index) -{ - struct orangefs_bufmap_desc *from; - void *page_from; - - from = &__orangefs_bufmap->desc_array[buffer_index]; - page_from = kmap_atomic(from->page_array[slot_index]); - memcpy(page_to, page_from, PAGE_SIZE); - kunmap_atomic(page_from); -} diff --git a/fs/orangefs/orangefs-bufmap.h b/fs/orangefs/orangefs-bufmap.h index 75b2d2833af1..4231175ccdb2 100644 --- a/fs/orangefs/orangefs-bufmap.h +++ b/fs/orangefs/orangefs-bufmap.h @@ -10,8 +10,6 @@ int orangefs_bufmap_size_query(void); -int orangefs_bufmap_shift_query(void); - int orangefs_bufmap_initialize(struct ORANGEFS_dev_map_desc *user_desc); void orangefs_bufmap_finalize(void); @@ -34,6 +32,5 @@ int orangefs_bufmap_copy_to_iovec(struct iov_iter *iter, int buffer_index, size_t size); -void orangefs_bufmap_page_fill(void *kaddr, int buffer_index, int slot_index); #endif /* __ORANGEFS_BUFMAP_H */ diff --git a/fs/proc/base.c b/fs/proc/base.c index cd89e956c322..5538c4aee8fa 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1489,7 +1489,6 @@ static const struct file_operations proc_fail_nth_operations = { #endif -#ifdef CONFIG_SCHED_DEBUG /* * Print out various scheduling related per-task fields: */ @@ -1539,8 +1538,6 @@ static const struct file_operations proc_pid_sched_operations = { .release = single_release, }; -#endif - #ifdef CONFIG_SCHED_AUTOGROUP /* * Print out autogroup related information: @@ -2497,11 +2494,9 @@ static const struct file_operations proc_map_files_operations = { #if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS) struct timers_private { - struct pid *pid; - struct task_struct *task; - struct sighand_struct *sighand; - struct pid_namespace *ns; - unsigned long flags; + struct pid 
*pid; + struct task_struct *task; + struct pid_namespace *ns; }; static void *timers_start(struct seq_file *m, loff_t *pos) @@ -2512,54 +2507,48 @@ static void *timers_start(struct seq_file *m, loff_t *pos) if (!tp->task) return ERR_PTR(-ESRCH); - tp->sighand = lock_task_sighand(tp->task, &tp->flags); - if (!tp->sighand) - return ERR_PTR(-ESRCH); - - return seq_hlist_start(&tp->task->signal->posix_timers, *pos); + rcu_read_lock(); + return seq_hlist_start_rcu(&tp->task->signal->posix_timers, *pos); } static void *timers_next(struct seq_file *m, void *v, loff_t *pos) { struct timers_private *tp = m->private; - return seq_hlist_next(v, &tp->task->signal->posix_timers, pos); + + return seq_hlist_next_rcu(v, &tp->task->signal->posix_timers, pos); } static void timers_stop(struct seq_file *m, void *v) { struct timers_private *tp = m->private; - if (tp->sighand) { - unlock_task_sighand(tp->task, &tp->flags); - tp->sighand = NULL; - } - if (tp->task) { put_task_struct(tp->task); tp->task = NULL; + rcu_read_unlock(); } } static int show_timer(struct seq_file *m, void *v) { - struct k_itimer *timer; - struct timers_private *tp = m->private; - int notify; static const char * const nstr[] = { - [SIGEV_SIGNAL] = "signal", - [SIGEV_NONE] = "none", - [SIGEV_THREAD] = "thread", + [SIGEV_SIGNAL] = "signal", + [SIGEV_NONE] = "none", + [SIGEV_THREAD] = "thread", }; - timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list); - notify = timer->it_sigev_notify; + struct k_itimer *timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list); + struct timers_private *tp = m->private; + int notify = timer->it_sigev_notify; + + guard(spinlock_irq)(&timer->it_lock); + if (!posixtimer_valid(timer)) + return 0; seq_printf(m, "ID: %d\n", timer->it_id); - seq_printf(m, "signal: %d/%px\n", - timer->sigq.info.si_signo, + seq_printf(m, "signal: %d/%px\n", timer->sigq.info.si_signo, timer->sigq.info.si_value.sival_ptr); - seq_printf(m, "notify: %s/%s.%d\n", - nstr[notify & ~SIGEV_THREAD_ID], + seq_printf(m, "notify: %s/%s.%d\n", nstr[notify & ~SIGEV_THREAD_ID], (notify & SIGEV_THREAD_ID) ? 
"tid" : "pid", pid_nr_ns(timer->it_pid, tp->ns)); seq_printf(m, "ClockID: %d\n", timer->it_clock); @@ -3331,9 +3320,7 @@ static const struct pid_entry tgid_base_stuff[] = { ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), -#ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), -#endif #ifdef CONFIG_SCHED_AUTOGROUP REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), #endif @@ -3682,9 +3669,7 @@ static const struct pid_entry tid_base_stuff[] = { ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), -#ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), -#endif NOD("comm", S_IFREG|S_IRUGO|S_IWUSR, &proc_tid_comm_inode_operations, &proc_pid_set_comm_operations, {}), diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 1cb33771bf9f..728630b10fdf 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -34,8 +34,6 @@ #include <asm/sections.h> #include "internal.h" -#define CORE_STR "CORE" - #ifndef ELF_CORE_EFLAGS #define ELF_CORE_EFLAGS 0 #endif @@ -122,7 +120,9 @@ static void update_kcore_size(void) kcore_phdrs_len = kcore_nphdr * sizeof(struct elf_phdr); kcore_notes_len = (4 * sizeof(struct elf_note) + - 3 * ALIGN(sizeof(CORE_STR), 4) + + ALIGN(sizeof(NN_PRSTATUS), 4) + + ALIGN(sizeof(NN_PRPSINFO), 4) + + ALIGN(sizeof(NN_TASKSTRUCT), 4) + VMCOREINFO_NOTE_NAME_BYTES + ALIGN(sizeof(struct elf_prstatus), 4) + ALIGN(sizeof(struct elf_prpsinfo), 4) + @@ -443,11 +443,11 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) goto out; } - append_kcore_note(notes, &i, CORE_STR, NT_PRSTATUS, &prstatus, + append_kcore_note(notes, &i, NN_PRSTATUS, NT_PRSTATUS, &prstatus, sizeof(prstatus)); - append_kcore_note(notes, &i, CORE_STR, NT_PRPSINFO, &prpsinfo, + append_kcore_note(notes, &i, NN_PRPSINFO, NT_PRPSINFO, &prpsinfo, sizeof(prpsinfo)); - append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current, + append_kcore_note(notes, &i, NN_TASKSTRUCT, NT_TASKSTRUCT, current, arch_task_struct_size); /* * vmcoreinfo_size is mostly constant after init time, but it diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 034143869421..bb3b769edc71 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -266,7 +266,7 @@ static int pstore_parse_param(struct fs_context *fc, struct fs_parameter *param) static int pstore_show_options(struct seq_file *m, struct dentry *root) { if (kmsg_bytes != CONFIG_PSTORE_DEFAULT_KMSG_BYTES) - seq_printf(m, ",kmsg_bytes=%lu", kmsg_bytes); + seq_printf(m, ",kmsg_bytes=%u", kmsg_bytes); return 0; } diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 801d6c0b170c..a0fc51196910 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -6,7 +6,7 @@ #include <linux/time.h> #include <linux/pstore.h> -extern unsigned long kmsg_bytes; +extern unsigned int kmsg_bytes; #ifdef CONFIG_PSTORE_FTRACE extern void pstore_register_ftrace(void); @@ -35,7 +35,7 @@ static inline void pstore_unregister_pmsg(void) {} extern struct pstore_info *psinfo; -extern void pstore_set_kmsg_bytes(int); +extern void pstore_set_kmsg_bytes(unsigned int bytes); extern void pstore_get_records(int); extern void pstore_get_backend_records(struct pstore_info *psi, struct dentry *root, int quiet); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index f56b066ab80c..557cf9d40177 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ 
-92,8 +92,8 @@ module_param(compress, charp, 0444); MODULE_PARM_DESC(compress, "compression to use"); /* How much of the kernel log to snapshot */ -unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES; -module_param(kmsg_bytes, ulong, 0444); +unsigned int kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES; +module_param(kmsg_bytes, uint, 0444); MODULE_PARM_DESC(kmsg_bytes, "amount of kernel log to snapshot (in bytes)"); static void *compress_workspace; @@ -107,9 +107,9 @@ static void *compress_workspace; static char *big_oops_buf; static size_t max_compressed_size; -void pstore_set_kmsg_bytes(int bytes) +void pstore_set_kmsg_bytes(unsigned int bytes) { - kmsg_bytes = bytes; + WRITE_ONCE(kmsg_bytes, bytes); } /* Tag each group of saved records with a sequence number */ @@ -278,6 +278,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, struct kmsg_dump_detail *detail) { struct kmsg_dump_iter iter; + unsigned int remaining = READ_ONCE(kmsg_bytes); unsigned long total = 0; const char *why; unsigned int part = 1; @@ -300,7 +301,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, kmsg_dump_rewind(&iter); oopscount++; - while (total < kmsg_bytes) { + while (total < remaining) { char *dst; size_t dst_size; int header_size; diff --git a/fs/smb/client/cifsacl.c b/fs/smb/client/cifsacl.c index 64bd68f750f8..63b3b1290bed 100644 --- a/fs/smb/client/cifsacl.c +++ b/fs/smb/client/cifsacl.c @@ -811,7 +811,23 @@ static void parse_dacl(struct smb_acl *pdacl, char *end_of_acl, return; for (i = 0; i < num_aces; ++i) { + if (end_of_acl - acl_base < acl_size) + break; + ppace[i] = (struct smb_ace *) (acl_base + acl_size); + acl_base = (char *)ppace[i]; + acl_size = offsetof(struct smb_ace, sid) + + offsetof(struct smb_sid, sub_auth); + + if (end_of_acl - acl_base < acl_size || + ppace[i]->sid.num_subauth == 0 || + ppace[i]->sid.num_subauth > SID_MAX_SUB_AUTHORITIES || + (end_of_acl - acl_base < + acl_size + sizeof(__le32) * ppace[i]->sid.num_subauth) || + (le16_to_cpu(ppace[i]->size) < + acl_size + sizeof(__le32) * ppace[i]->sid.num_subauth)) + break; + #ifdef CONFIG_CIFS_DEBUG2 dump_ace(ppace[i], end_of_acl); #endif @@ -855,7 +871,6 @@ static void parse_dacl(struct smb_acl *pdacl, char *end_of_acl, (void *)ppace[i], sizeof(struct smb_ace)); */ - acl_base = (char *)ppace[i]; acl_size = le16_to_cpu(ppace[i]->size); } @@ -1550,7 +1565,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, int rc = 0; struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); struct smb_version_operations *ops; - const u32 info = 0; + const u32 info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO; cifs_dbg(NOISY, "converting ACL to mode for %s\n", path); @@ -1604,7 +1619,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, struct tcon_link *tlink; struct smb_version_operations *ops; bool mode_from_sid, id_from_sid; - const u32 info = 0; + const u32 info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO; bool posix; tlink = cifs_sb_tlink(cifs_sb); diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 6a3bd652d251..a08c42363ffc 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -637,6 +637,10 @@ cifs_show_options(struct seq_file *s, struct dentry *root) cifs_sb->ctx->dir_mode); if (cifs_sb->ctx->iocharset) seq_printf(s, ",iocharset=%s", cifs_sb->ctx->iocharset); + if (tcon->ses->unicode == 0) + seq_puts(s, ",nounicode"); + else if (tcon->ses->unicode == 1) + seq_puts(s, ",unicode"); if (tcon->seal) seq_puts(s, ",seal"); else if 
(tcon->ses->server->ignore_signature) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index cddeb2adbf4a..6ae170a2a042 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -653,6 +653,7 @@ struct smb_version_values { unsigned int cap_unix; unsigned int cap_nt_find; unsigned int cap_large_files; + unsigned int cap_unicode; __u16 signing_enabled; __u16 signing_required; size_t create_lease_size; @@ -1120,6 +1121,7 @@ struct cifs_ses { bool sign; /* is signing required? */ bool domainAuto:1; bool expired_pwd; /* track if access denied or expired pwd so can know if need to update */ + int unicode; unsigned int flags; __u16 session_flags; __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 81680001944d..cfcc07905bdf 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -31,6 +31,9 @@ extern void cifs_small_buf_release(void *); extern void free_rsp_buf(int, void *); extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *, unsigned int /* length */); +extern int smb_send_kvec(struct TCP_Server_Info *server, + struct msghdr *msg, + size_t *sent); extern unsigned int _get_xid(void); extern void _free_xid(unsigned int); #define get_xid() \ @@ -592,7 +595,6 @@ int cifs_async_readv(struct cifs_io_subrequest *rdata); int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid); void cifs_async_writev(struct cifs_io_subrequest *wdata); -void cifs_writev_complete(struct work_struct *work); int cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const unsigned char *path, char *pbuf, diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 4fc9485c5d91..29dcb88392e5 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -437,7 +437,10 @@ CIFSSMBNegotiate(const unsigned int xid, return rc; pSMB->hdr.Mid = get_next_mid(server); - pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS); + pSMB->hdr.Flags2 |= SMBFLG2_ERR_STATUS; + + if (ses->unicode != 0) + pSMB->hdr.Flags2 |= SMBFLG2_UNICODE; if (should_set_ext_sec_flag(ses->sectype)) { cifs_dbg(FYI, "Requesting extended security\n"); @@ -2709,6 +2712,9 @@ int cifs_query_reparse_point(const unsigned int xid, if (cap_unix(tcon->ses)) return -EOPNOTSUPP; + if (!(le32_to_cpu(tcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS)) + return -EOPNOTSUPP; + oparms = (struct cifs_open_parms) { .tcon = tcon, .cifs_sb = cifs_sb, @@ -3400,8 +3406,7 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, /* BB TEST with big acls that might need to be e.g. larger than 16K */ pSMB->MaxSetupCount = 0; pSMB->Fid = fid; /* file handle always le */ - pSMB->AclFlags = cpu_to_le32(CIFS_ACL_OWNER | CIFS_ACL_GROUP | - CIFS_ACL_DACL | info); + pSMB->AclFlags = cpu_to_le32(info); pSMB->ByteCount = cpu_to_le16(11); /* 3 bytes pad + 8 bytes parm */ inc_rfc1001_len(pSMB, 11); iov[0].iov_base = (char *)pSMB; diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 73f93a35eedd..d7bad2c3af37 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -300,6 +300,7 @@ cifs_abort_connection(struct TCP_Server_Info *server) server->ssocket->flags); sock_release(server->ssocket); server->ssocket = NULL; + put_net(cifs_net_ns(server)); } server->sequence_number = 0; server->session_estab = false; @@ -1676,6 +1677,7 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, /* Grab netns reference for this server. 
*/ cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns)); + tcp_ses->sign = ctx->sign; tcp_ses->conn_id = atomic_inc_return(&tcpSesNextId); tcp_ses->noblockcnt = ctx->rootfs; tcp_ses->noblocksnd = ctx->noblocksnd || ctx->rootfs; @@ -2349,6 +2351,7 @@ retry_old_session: ses->cred_uid = ctx->cred_uid; ses->linux_uid = ctx->linux_uid; + ses->unicode = ctx->unicode; ses->sectype = ctx->sectype; ses->sign = ctx->sign; @@ -3026,6 +3029,44 @@ bind_socket(struct TCP_Server_Info *server) } static int +smb_recv_kvec(struct TCP_Server_Info *server, struct msghdr *msg, size_t *recv) +{ + int rc = 0; + int retries = 0; + int msg_flags = server->noblocksnd ? MSG_DONTWAIT : 0; + + *recv = 0; + + while (msg_data_left(msg)) { + rc = sock_recvmsg(server->ssocket, msg, msg_flags); + if (rc == -EAGAIN) { + retries++; + if (retries >= 14 || + (!server->noblocksnd && (retries > 2))) { + cifs_server_dbg(VFS, "reads on sock %p stuck for 15 seconds\n", + server->ssocket); + return -EAGAIN; + } + msleep(1 << retries); + continue; + } + + if (rc < 0) + return rc; + + if (rc == 0) { + cifs_dbg(FYI, "Received no data (TCP RST)\n"); + return -ECONNABORTED; + } + + /* recv was at least partially successful */ + *recv += rc; + retries = 0; /* in case we get -EAGAIN on the next receive */ + } + return 0; +} + +static int ip_rfc1001_connect(struct TCP_Server_Info *server) { int rc = 0; @@ -3035,8 +3076,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) * sessinit is sent but no second negprot */ struct rfc1002_session_packet req = {}; - struct smb_hdr *smb_buf = (struct smb_hdr *)&req; + struct rfc1002_session_packet resp = {}; + struct msghdr msg = {}; + struct kvec iov = {}; unsigned int len; + size_t sent; + size_t recv; req.trailer.session_req.called_len = sizeof(req.trailer.session_req.called_name); @@ -3065,19 +3110,118 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) * As per rfc1002, @len must be the number of bytes that follows the * length field of a rfc1002 session request payload. */ - len = sizeof(req) - offsetof(struct rfc1002_session_packet, trailer.session_req); + len = sizeof(req.trailer.session_req); + req.type = RFC1002_SESSION_REQUEST; + req.flags = 0; + req.length = cpu_to_be16(len); + len += offsetof(typeof(req), trailer.session_req); + iov.iov_base = &req; + iov.iov_len = len; + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iov, 1, len); + rc = smb_send_kvec(server, &msg, &sent); + if (rc < 0 || len != sent) + return (rc == -EINTR || rc == -EAGAIN) ? rc : -ECONNABORTED; - smb_buf->smb_buf_length = cpu_to_be32((RFC1002_SESSION_REQUEST << 24) | len); - rc = smb_send(server, smb_buf, len); /* * RFC1001 layer in at least one server requires very short break before * negprot presumably because not expecting negprot to follow so fast. - * This is a simple solution that works without complicating the code - * and causes no significant slowing down on mount for everyone else + * For example, DOS SMB servers cannot process negprot if it is received + * before the server has sent a response to the SESSION_REQUEST packet. + * So wait for the response, read it and parse it, as it can contain + * useful error information (e.g. that the specified server name was + * incorrect). Even the latest Windows Server 2022 SMB1 server over + * port 139 sends an error if the server name in the SESSION_REQUEST + * packet was incorrect. Nowadays usage of port 139 is not common, so + * waiting for the reply here does not slow down mounting in the common + * case (over port 445).
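+ * + * For reference, the 4-octet session header exchanged here (values per RFC 1002 section 4.3; only the packet types handled below are listed): + * + *	0x81 0x00 len=68	SESSION REQUEST (called + calling NetBIOS names) + *	0x82 0x00 len=0		POSITIVE SESSION RESPONSE + *	0x83 0x00 len=1		NEGATIVE SESSION RESPONSE (one error code octet) + *	0x84 0x00 len=6		SESSION RETARGET RESPONSE (new IPv4 address + port)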
*/ - usleep_range(1000, 2000); + len = offsetof(typeof(resp), trailer); + iov.iov_base = &resp; + iov.iov_len = len; + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, len); + rc = smb_recv_kvec(server, &msg, &recv); + if (rc < 0 || recv != len) + return (rc == -EINTR || rc == -EAGAIN) ? rc : -ECONNABORTED; + + switch (resp.type) { + case RFC1002_POSITIVE_SESSION_RESPONSE: + if (be16_to_cpu(resp.length) != 0) { + cifs_dbg(VFS, "RFC 1002 positive session response but with invalid non-zero length %u\n", + be16_to_cpu(resp.length)); + return -EIO; + } + cifs_dbg(FYI, "RFC 1002 positive session response"); + break; + case RFC1002_NEGATIVE_SESSION_RESPONSE: + /* Read RFC1002 response error code and convert it to errno in rc */ + len = sizeof(resp.trailer.neg_ses_resp_error_code); + iov.iov_base = &resp.trailer.neg_ses_resp_error_code; + iov.iov_len = len; + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, len); + if (be16_to_cpu(resp.length) == len && + smb_recv_kvec(server, &msg, &recv) == 0 && + recv == len) { + cifs_dbg(VFS, "RFC 1002 negative session response with error 0x%x\n", + resp.trailer.neg_ses_resp_error_code); + switch (resp.trailer.neg_ses_resp_error_code) { + case RFC1002_NOT_LISTENING_CALLED: + /* server does not listen for specified server name */ + fallthrough; + case RFC1002_NOT_PRESENT: + /* server name is incorrect */ + rc = -ENOENT; + cifs_dbg(VFS, "Server rejected NetBIOS servername %.15s\n", + server->server_RFC1001_name[0] ? + server->server_RFC1001_name : + DEFAULT_CIFS_CALLED_NAME); + cifs_dbg(VFS, "Specify correct NetBIOS servername in source path or with -o servern= option\n"); + break; + case RFC1002_NOT_LISTENING_CALLING: + /* client name was not accepted by server */ + rc = -EACCES; + cifs_dbg(VFS, "Server rejected NetBIOS clientname %.15s\n", + server->workstation_RFC1001_name[0] ? + server->workstation_RFC1001_name : + "LINUX_CIFS_CLNT"); + cifs_dbg(VFS, "Specify correct NetBIOS clientname with -o netbiosname= option\n"); + break; + case RFC1002_INSUFFICIENT_RESOURCE: + /* remote server resource error */ + rc = -EREMOTEIO; + break; + case RFC1002_UNSPECIFIED_ERROR: + default: + /* other/unknown error */ + rc = -EIO; + break; + } + } else { + cifs_dbg(VFS, "RFC 1002 negative session response\n"); + rc = -EIO; + } + return rc; + case RFC1002_RETARGET_SESSION_RESPONSE: + cifs_dbg(VFS, "RFC 1002 retarget session response\n"); + if (be16_to_cpu(resp.length) == sizeof(resp.trailer.retarget_resp)) { + len = sizeof(resp.trailer.retarget_resp); + iov.iov_base = &resp.trailer.retarget_resp; + iov.iov_len = len; + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, len); + if (smb_recv_kvec(server, &msg, &recv) == 0 && recv == len) { + cifs_dbg(VFS, "Server wants to redirect connection\n"); + cifs_dbg(VFS, "Remount with options -o ip=%pI4,port=%u\n", + &resp.trailer.retarget_resp.retarget_ip_addr, + be16_to_cpu(resp.trailer.retarget_resp.port)); + } + } + cifs_dbg(VFS, "Closing connection\n"); + /* FIXME: Should we automatically redirect to new retarget_resp server? */ + return -EMULTIHOP; + default: + cifs_dbg(VFS, "RFC 1002 unknown response type 0x%x\n", resp.type); + return -EIO; + } - return rc; + return 0; } static int @@ -3123,8 +3267,12 @@ generic_ip_connect(struct TCP_Server_Info *server) /* * Grab netns reference for the socket. * - * It'll be released here, on error, or in clean_demultiplex_info() upon server - * teardown. + * This reference will be released in several situations: + * - In the failure path before the cifsd thread is started. 
+ * - In all places where server->ssocket is released; it is + * also set to NULL there. + * - Ultimately in clean_demultiplex_info(), during the final + * teardown. */ get_net(net); @@ -3140,10 +3288,8 @@ generic_ip_connect(struct TCP_Server_Info *server) } rc = bind_socket(server); - if (rc < 0) { - put_net(cifs_net_ns(server)); + if (rc < 0) return rc; - } /* * Eventually check for other socket options to change from @@ -3189,9 +3335,6 @@ generic_ip_connect(struct TCP_Server_Info *server) if (sport == htons(RFC1001_PORT)) rc = ip_rfc1001_connect(server); - if (rc < 0) - put_net(cifs_net_ns(server)); - return rc; } @@ -3981,7 +4124,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, struct TCP_Server_Info *server, struct nls_table *nls_info) { - int rc = -ENOSYS; + int rc = 0; struct TCP_Server_Info *pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&pserver->dstaddr; struct sockaddr_in *addr = (struct sockaddr_in *)&pserver->dstaddr; @@ -4033,6 +4176,26 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, if (!linuxExtEnabled) ses->capabilities &= (~server->vals->cap_unix); + /* + * Check if the server supports the specified encoding mode. + * A zero value in vals->cap_unicode indicates that the chosen + * protocol dialect does not support non-UNICODE mode. + */ + if (ses->unicode == 1 && server->vals->cap_unicode != 0 && + !(server->capabilities & server->vals->cap_unicode)) { + cifs_dbg(VFS, "Server does not support mounting in UNICODE mode\n"); + rc = -EOPNOTSUPP; + } else if (ses->unicode == 0 && server->vals->cap_unicode == 0) { + cifs_dbg(VFS, "Server does not support mounting in non-UNICODE mode\n"); + rc = -EOPNOTSUPP; + } else if (ses->unicode == 0) { + /* + * When UNICODE mode is explicitly disabled, do not + * announce the client UNICODE capability.
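+ * + * For reference, ses->unicode is inherited from ctx->unicode, whose value maps to the mount options added in this series: + * + *	(option absent)	-> ctx->unicode = -1	autodetect, prefer UNICODE + *	-o unicode	-> ctx->unicode = 1	force UNICODE mode + *	-o nounicode	-> ctx->unicode = 0	force non-UNICODE mode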
+ */ + ses->capabilities &= (~server->vals->cap_unicode); + } + if (ses->auth_key.response) { cifs_dbg(FYI, "Free previous auth_key.response = %p\n", ses->auth_key.response); @@ -4045,8 +4208,12 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, cifs_dbg(FYI, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d\n", server->sec_mode, server->capabilities, server->timeAdj); - if (server->ops->sess_setup) - rc = server->ops->sess_setup(xid, ses, server, nls_info); + if (!rc) { + if (server->ops->sess_setup) + rc = server->ops->sess_setup(xid, ses, server, nls_info); + else + rc = -ENOSYS; + } if (rc) { cifs_server_dbg(VFS, "Send error in SessSetup = %d\n", rc); @@ -4116,6 +4283,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) ctx->seal = master_tcon->seal; ctx->witness = master_tcon->use_witness; ctx->dfs_root_ses = master_tcon->ses->dfs_root_ses; + ctx->unicode = master_tcon->ses->unicode; rc = cifs_set_vol_auth(ctx, master_tcon->ses); if (rc) { diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 9e4f7378f30f..8407fb108664 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -3082,7 +3082,7 @@ void cifs_oplock_break(struct work_struct *work) cinode->oplock = 0; } - if (inode && S_ISREG(inode->i_mode)) { + if (S_ISREG(inode->i_mode)) { if (CIFS_CACHE_READ(cinode)) break_lease(inode, O_RDONLY); else diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 8c73d4d60d1a..bdb762d398af 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -134,6 +134,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_flag("compress", Opt_compress), fsparam_flag("witness", Opt_witness), fsparam_flag_no("nativesocket", Opt_nativesocket), + fsparam_flag_no("unicode", Opt_unicode), /* Mount options which take uid or gid */ fsparam_uid("backupuid", Opt_backupuid), @@ -963,6 +964,10 @@ static int smb3_verify_reconfigure_ctx(struct fs_context *fc, cifs_errorf(fc, "can not change iocharset during remount\n"); return -EINVAL; } + if (new_ctx->unicode != old_ctx->unicode) { + cifs_errorf(fc, "can not change unicode during remount\n"); + return -EINVAL; + } return 0; } @@ -1118,6 +1123,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, int i, opt; bool is_smb3 = !strcmp(fc->fs_type->name, "smb3"); bool skip_parsing = false; + char *hostname; cifs_dbg(FYI, "CIFS: parsing cifs mount option '%s'\n", param->key); @@ -1443,6 +1449,16 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, cifs_errorf(fc, "OOM when copying UNC string\n"); goto cifs_parse_mount_err; } + hostname = extract_hostname(ctx->UNC); + if (IS_ERR(hostname)) { + cifs_errorf(fc, "Cannot extract hostname from UNC string\n"); + goto cifs_parse_mount_err; + } + /* last byte, type, is 0x20 for server type */ + memset(ctx->target_rfc1001_name, 0x20, RFC1001_NAME_LEN_WITH_NULL); + for (i = 0; i < RFC1001_NAME_LEN && hostname[i] != 0; i++) + ctx->target_rfc1001_name[i] = toupper(hostname[i]); + kfree(hostname); break; case Opt_user: kfree(ctx->username); @@ -1627,6 +1643,10 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->witness = true; pr_warn_once("Witness protocol support is experimental\n"); break; + case Opt_unicode: + ctx->unicode = !result.negated; + cifs_dbg(FYI, "unicode set to %d\n", ctx->unicode); + break; case Opt_rootfs: #ifndef CONFIG_CIFS_ROOT cifs_dbg(VFS, "rootfs support requires CONFIG_CIFS_ROOT config option\n"); @@ -1928,6 +1948,8 @@ int smb3_init_fs_context(struct fs_context *fc) 
ctx->symlink_type = CIFS_SYMLINK_TYPE_DEFAULT; ctx->nonativesocket = 0; + ctx->unicode = -1; /* autodetect, but prefer UNICODE mode */ + /* * short int override_uid = -1; * short int override_gid = -1; diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index 881bfc08667e..42c6b66c2c1a 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -135,6 +135,7 @@ enum cifs_param { Opt_witness, Opt_is_upcall_target_mount, Opt_is_upcall_target_application, + Opt_unicode, /* Mount options which take numeric value */ Opt_backupuid, @@ -306,6 +307,7 @@ struct smb3_fs_context { bool compress; /* enable SMB2 messages (READ/WRITE) de/compression */ bool rootfs:1; /* if it's a SMB root file system */ bool witness:1; /* use witness protocol */ + int unicode; char *leaf_fullpath; struct cifs_ses *dfs_root_ses; bool dfs_automount:1; /* set for dfs automount only */ diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c index 6e6c09cc5ce7..a88253668286 100644 --- a/fs/smb/client/link.c +++ b/fs/smb/client/link.c @@ -643,7 +643,8 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, case CIFS_SYMLINK_TYPE_NATIVE: case CIFS_SYMLINK_TYPE_NFS: case CIFS_SYMLINK_TYPE_WSL: - if (server->ops->create_reparse_symlink) { + if (server->ops->create_reparse_symlink && + (le32_to_cpu(pTcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS)) { rc = server->ops->create_reparse_symlink(xid, inode, direntry, pTcon, diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index faa80e7d54a6..f2ca5963cd9d 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -242,7 +242,7 @@ int cifs_try_adding_channels(struct cifs_ses *ses) iface->num_channels++; iface->weight_fulfilled++; - cifs_dbg(VFS, "successfully opened new channel on iface:%pIS\n", + cifs_info("successfully opened new channel on iface:%pIS\n", &iface->sockaddr); break; } @@ -501,6 +501,7 @@ cifs_ses_add_channel(struct cifs_ses *ses, ctx->password = ses->password; ctx->sectype = ses->sectype; ctx->sign = ses->sign; + ctx->unicode = ses->unicode; /* UNC and paths */ /* XXX: Use ses->server->hostname? 
*/ @@ -522,6 +523,13 @@ cifs_ses_add_channel(struct cifs_ses *ses, ctx->sockopt_tcp_nodelay = ses->server->tcp_nodelay; ctx->echo_interval = ses->server->echo_interval / HZ; ctx->max_credits = ses->server->max_credits; + ctx->min_offload = ses->server->min_offload; + ctx->compress = ses->server->compression.requested; + ctx->dfs_conn = ses->server->dfs_conn; + ctx->ignore_signature = ses->server->ignore_signature; + ctx->leaf_fullpath = ses->server->leaf_fullpath; + ctx->rootfs = ses->server->noblockcnt; + ctx->retrans = ses->server->retrans; /* * This will be used for encoding/decoding user/domain/pw diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index d6e2fb669c40..8701484805cd 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -1170,6 +1170,7 @@ struct smb_version_values smb1_values = { .cap_unix = CAP_UNIX, .cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND, .cap_large_files = CAP_LARGE_FILES, + .cap_unicode = CAP_UNICODE, .signing_enabled = SECMODE_SIGN_ENABLED, .signing_required = SECMODE_SIGN_REQUIRED, }; diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 826b57a5a2a8..e9fd3e204a6f 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -1273,6 +1273,14 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data, int rc; int i; + /* + * If the server filesystem does not support reparse points then do not + * attempt to create a reparse point. This prevents creating an unusable + * empty object on the server. + */ + if (!(le32_to_cpu(tcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS)) + return ERR_PTR(-EOPNOTSUPP); + oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, SYNCHRONIZE | DELETE | FILE_READ_ATTRIBUTES | diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c index f3c4b70b77b9..cddf273c14ae 100644 --- a/fs/smb/client/smb2misc.c +++ b/fs/smb/client/smb2misc.c @@ -816,11 +816,12 @@ smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid, WARN_ONCE(tcon->tc_count < 0, "tcon refcount is negative"); spin_unlock(&cifs_tcp_ses_lock); - if (tcon->ses) { server = tcon->ses->server; - - cifs_server_dbg(FYI, "tid=0x%x: tcon is closing, skipping async close retry of fid %llu %llu\n", - tcon->tid, persistent_fid, volatile_fid); + cifs_server_dbg(FYI, + "tid=0x%x: tcon is closing, skipping async close retry of fid %llu %llu\n", + tcon->tid, persistent_fid, volatile_fid); + } return 0; } diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 4dd11eafb69d..a700e5921961 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -969,7 +969,7 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, if (islink) rc = -EREMOTE; } - if (rc == -EREMOTE && IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) && cifs_sb && + if (rc == -EREMOTE && IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS)) rc = -EOPNOTSUPP; goto out; @@ -5229,7 +5229,7 @@ static int smb2_make_node(unsigned int xid, struct inode *inode, const char *full_path, umode_t mode, dev_t dev) { struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - int rc; + int rc = -EOPNOTSUPP; /* * Check if mounted with the 'sfu' mount parm. 
@@ -5240,7 +5240,7 @@ static int smb2_make_node(unsigned int xid, struct inode *inode, if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { rc = cifs_sfu_make_node(xid, inode, dentry, tcon, full_path, mode, dev); - } else { + } else if (le32_to_cpu(tcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS) { rc = smb2_mknod_reparse(xid, inode, dentry, tcon, full_path, mode, dev); } diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index f9c521b3c65e..4f69a1825e42 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -3916,12 +3916,10 @@ SMB2_query_acl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, void **data, u32 *plen, u32 extra_info) { - __u32 additional_info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO | - extra_info; *plen = 0; return query_info(xid, tcon, persistent_fid, volatile_fid, - 0, SMB2_O_INFO_SECURITY, additional_info, + 0, SMB2_O_INFO_SECURITY, extra_info, SMB2_MAX_BUFFER_SIZE, MIN_SEC_DESC_LEN, data, plen); } diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index 0dc80959ce48..03434dbe9374 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -179,7 +179,7 @@ delete_mid(struct mid_q_entry *mid) * Our basic "send data to server" function. Should be called with srv_mutex * held. The caller is responsible for handling the results. */ -static int +int smb_send_kvec(struct TCP_Server_Info *server, struct msghdr *smb_msg, size_t *sent) { diff --git a/fs/smb/client/xattr.c b/fs/smb/client/xattr.c index 58a584f0b27e..7d49f38f01f3 100644 --- a/fs/smb/client/xattr.c +++ b/fs/smb/client/xattr.c @@ -320,10 +320,17 @@ static int cifs_xattr_get(const struct xattr_handler *handler, if (pTcon->ses->server->ops->get_acl == NULL) goto out; /* rc already EOPNOTSUPP */ - if (handler->flags == XATTR_CIFS_NTSD_FULL) { - extra_info = SACL_SECINFO; - } else { - extra_info = 0; + switch (handler->flags) { + case XATTR_CIFS_NTSD_FULL: + extra_info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO | SACL_SECINFO; + break; + case XATTR_CIFS_NTSD: + extra_info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO; + break; + case XATTR_CIFS_ACL: + default: + extra_info = DACL_SECINFO; + break; } pacl = pTcon->ses->server->ops->get_acl(cifs_sb, inode, full_path, &acllen, extra_info); diff --git a/fs/smb/server/Kconfig b/fs/smb/server/Kconfig index cabe6a843c6a..cf70e96ad4de 100644 --- a/fs/smb/server/Kconfig +++ b/fs/smb/server/Kconfig @@ -70,4 +70,4 @@ config SMB_SERVER_CHECK_CAP_NET_ADMIN config SMB_SERVER_KERBEROS5 bool "Support for Kerberos 5" depends on SMB_SERVER - default n + default y diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c index 2a5b4a96bf99..00b31cf86462 100644 --- a/fs/smb/server/auth.c +++ b/fs/smb/server/auth.c @@ -1218,7 +1218,7 @@ free_iv: free_sg: kfree(sg); free_req: - kfree(req); + aead_request_free(req); free_ctx: ksmbd_release_crypto_ctx(ctx); return rc; diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c index 71c6939dfbf1..53d308f331af 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -230,6 +230,9 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn) if (!ksmbd_chann_del(conn, sess) && xa_empty(&sess->ksmbd_chann_list)) { hash_del(&sess->hlist); + down_write(&conn->session_lock); + xa_erase(&conn->sessions, sess->id); + up_write(&conn->session_lock); ksmbd_session_destroy(sess); } } @@ -256,6 +259,22 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn) up_write(&sessions_table_lock); 
} +bool is_ksmbd_session_in_connection(struct ksmbd_conn *conn, + unsigned long long id) +{ + struct ksmbd_session *sess; + + down_read(&conn->session_lock); + sess = xa_load(&conn->sessions, id); + if (sess) { + up_read(&conn->session_lock); + return true; + } + up_read(&conn->session_lock); + + return false; +} + struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn, unsigned long long id) { diff --git a/fs/smb/server/mgmt/user_session.h b/fs/smb/server/mgmt/user_session.h index c1c4b20bd5c6..f21348381d59 100644 --- a/fs/smb/server/mgmt/user_session.h +++ b/fs/smb/server/mgmt/user_session.h @@ -87,6 +87,8 @@ void ksmbd_session_destroy(struct ksmbd_session *sess); struct ksmbd_session *ksmbd_session_lookup_slowpath(unsigned long long id); struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn, unsigned long long id); +bool is_ksmbd_session_in_connection(struct ksmbd_conn *conn, + unsigned long long id); int ksmbd_session_register(struct ksmbd_conn *conn, struct ksmbd_session *sess); void ksmbd_sessions_deregister(struct ksmbd_conn *conn); diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index 28886ff1ee57..f103b1bd0400 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -724,8 +724,8 @@ static int smb2_oplock_break_noti(struct oplock_info *opinfo) work->conn = conn; work->sess = opinfo->sess; + ksmbd_conn_r_count_inc(conn); if (opinfo->op_state == OPLOCK_ACK_WAIT) { - ksmbd_conn_r_count_inc(conn); INIT_WORK(&work->work, __smb2_oplock_break_noti); ksmbd_queue_work(work); @@ -833,8 +833,8 @@ static int smb2_lease_break_noti(struct oplock_info *opinfo) work->conn = conn; work->sess = opinfo->sess; + ksmbd_conn_r_count_inc(conn); if (opinfo->op_state == OPLOCK_ACK_WAIT) { - ksmbd_conn_r_count_inc(conn); INIT_WORK(&work->work, __smb2_lease_break_noti); ksmbd_queue_work(work); wait_for_break_ack(opinfo); @@ -1505,6 +1505,10 @@ struct lease_ctx_info *parse_lease_state(void *open_req) if (sizeof(struct lease_context_v2) == le32_to_cpu(cc->DataLength)) { struct create_lease_v2 *lc = (struct create_lease_v2 *)cc; + if (le16_to_cpu(cc->DataOffset) + le32_to_cpu(cc->DataLength) < + sizeof(struct create_lease_v2) - 4) + return NULL; + memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE); lreq->req_state = lc->lcontext.LeaseState; lreq->flags = lc->lcontext.LeaseFlags; @@ -1517,6 +1521,10 @@ struct lease_ctx_info *parse_lease_state(void *open_req) } else { struct create_lease *lc = (struct create_lease *)cc; + if (le16_to_cpu(cc->DataOffset) + le32_to_cpu(cc->DataLength) < + sizeof(struct create_lease)) + return NULL; + memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE); lreq->req_state = lc->lcontext.LeaseState; lreq->flags = lc->lcontext.LeaseFlags; diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index c53121538990..4ddf4300371b 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1707,44 +1707,38 @@ int smb2_sess_setup(struct ksmbd_work *work) if (conn->dialect != sess->dialect) { rc = -EINVAL; - ksmbd_user_session_put(sess); goto out_err; } if (!(req->hdr.Flags & SMB2_FLAGS_SIGNED)) { rc = -EINVAL; - ksmbd_user_session_put(sess); goto out_err; } if (strncmp(conn->ClientGUID, sess->ClientGUID, SMB2_CLIENT_GUID_SIZE)) { rc = -ENOENT; - ksmbd_user_session_put(sess); goto out_err; } if (sess->state == SMB2_SESSION_IN_PROGRESS) { rc = -EACCES; - ksmbd_user_session_put(sess); goto out_err; } if (sess->state == SMB2_SESSION_EXPIRED) { rc = -EFAULT; - ksmbd_user_session_put(sess); goto 
out_err; } - ksmbd_user_session_put(sess); if (ksmbd_conn_need_reconnect(conn)) { rc = -EFAULT; + ksmbd_user_session_put(sess); sess = NULL; goto out_err; } - sess = ksmbd_session_lookup(conn, sess_id); - if (!sess) { + if (is_ksmbd_session_in_connection(conn, sess_id)) { rc = -EACCES; goto out_err; } @@ -1910,6 +1904,8 @@ out_err: sess->last_active = jiffies; sess->state = SMB2_SESSION_EXPIRED; + ksmbd_user_session_put(sess); + work->sess = NULL; if (try_delay) { ksmbd_conn_set_need_reconnect(conn); ssleep(5); @@ -2708,6 +2704,13 @@ static int parse_durable_handle_context(struct ksmbd_work *work, goto out; } + if (le16_to_cpu(context->DataOffset) + + le32_to_cpu(context->DataLength) < + sizeof(struct create_durable_reconn_v2_req)) { + err = -EINVAL; + goto out; + } + recon_v2 = (struct create_durable_reconn_v2_req *)context; persistent_id = recon_v2->Fid.PersistentFileId; dh_info->fp = ksmbd_lookup_durable_fd(persistent_id); @@ -2741,6 +2744,13 @@ static int parse_durable_handle_context(struct ksmbd_work *work, goto out; } + if (le16_to_cpu(context->DataOffset) + + le32_to_cpu(context->DataLength) < + sizeof(struct create_durable_reconn_req)) { + err = -EINVAL; + goto out; + } + recon = (struct create_durable_reconn_req *)context; persistent_id = recon->Data.Fid.PersistentFileId; dh_info->fp = ksmbd_lookup_durable_fd(persistent_id); @@ -2766,6 +2776,13 @@ static int parse_durable_handle_context(struct ksmbd_work *work, goto out; } + if (le16_to_cpu(context->DataOffset) + + le32_to_cpu(context->DataLength) < + sizeof(struct create_durable_req_v2)) { + err = -EINVAL; + goto out; + } + durable_v2_blob = (struct create_durable_req_v2 *)context; ksmbd_debug(SMB, "Request for durable v2 open\n"); diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index c3785a5434f9..4998df04ab95 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -14,6 +14,7 @@ #include <linux/mempool.h> #include <linux/highmem.h> #include <linux/scatterlist.h> +#include <linux/string_choices.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> #include <rdma/rw.h> @@ -1396,7 +1397,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, } ksmbd_debug(RDMA, "RDMA %s, len %#x, needed credits %#x\n", - is_read ? 
"read" : "write", buf_len, credits_needed); + str_read_write(is_read), buf_len, credits_needed); ret = wait_for_rw_credits(t, credits_needed); if (ret < 0) @@ -2241,38 +2242,16 @@ bool ksmbd_rdma_capable_netdev(struct net_device *netdev) for (i = 0; i < smb_dev->ib_dev->phys_port_cnt; i++) { struct net_device *ndev; - if (smb_dev->ib_dev->ops.get_netdev) { - ndev = smb_dev->ib_dev->ops.get_netdev( - smb_dev->ib_dev, i + 1); - if (!ndev) - continue; + ndev = ib_device_get_netdev(smb_dev->ib_dev, i + 1); + if (!ndev) + continue; - if (ndev == netdev) { - dev_put(ndev); - rdma_capable = true; - goto out; - } + if (ndev == netdev) { dev_put(ndev); - /* if ib_dev does not implement ops.get_netdev - * check for matching infiniband GUID in hw_addr - */ - } else if (netdev->type == ARPHRD_INFINIBAND) { - struct netdev_hw_addr *ha; - union ib_gid gid; - u32 port_num; - int ret; - - netdev_hw_addr_list_for_each( - ha, &netdev->dev_addrs) { - memcpy(&gid, ha->addr + 4, sizeof(gid)); - ret = ib_find_gid(smb_dev->ib_dev, &gid, - &port_num, NULL); - if (!ret) { - rdma_capable = true; - goto out; - } - } + rdma_capable = true; + goto out; } + dev_put(ndev); } } out: @@ -2289,7 +2268,7 @@ out: } ksmbd_debug(RDMA, "netdev(%s) rdma capable : %s\n", - netdev->name, rdma_capable ? "true" : "false"); + netdev->name, str_true_false(rdma_capable)); return rdma_capable; } diff --git a/fs/timerfd.c b/fs/timerfd.c index 753e22e83e0f..c68f28d9c426 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -205,9 +205,8 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags, ALARM_REALTIME : ALARM_BOOTTIME, timerfd_alarmproc); } else { - hrtimer_init(&ctx->t.tmr, clockid, htmode); + hrtimer_setup(&ctx->t.tmr, timerfd_tmrproc, clockid, htmode); hrtimer_set_expires(&ctx->t.tmr, texp); - ctx->t.tmr.function = timerfd_tmrproc; } if (texp != 0) { @@ -429,7 +428,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) ALARM_REALTIME : ALARM_BOOTTIME, timerfd_alarmproc); else - hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS); + hrtimer_setup(&ctx->t.tmr, timerfd_tmrproc, clockid, HRTIMER_MODE_ABS); ctx->moffs = ktime_mono_to_real(0); diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c index 0b48cbab8a3d..ea6f06adcd43 100644 --- a/fs/ubifs/compress.c +++ b/fs/ubifs/compress.c @@ -15,7 +15,8 @@ * decompression. 
*/ -#include <linux/crypto.h> +#include <crypto/acompress.h> +#include <linux/highmem.h> #include "ubifs.h" /* Fake description object for the "none" compressor */ @@ -26,11 +27,8 @@ static struct ubifs_compressor none_compr = { }; #ifdef CONFIG_UBIFS_FS_LZO -static DEFINE_MUTEX(lzo_mutex); - static struct ubifs_compressor lzo_compr = { .compr_type = UBIFS_COMPR_LZO, - .comp_mutex = &lzo_mutex, .name = "lzo", .capi_name = "lzo", }; @@ -42,13 +40,8 @@ static struct ubifs_compressor lzo_compr = { #endif #ifdef CONFIG_UBIFS_FS_ZLIB -static DEFINE_MUTEX(deflate_mutex); -static DEFINE_MUTEX(inflate_mutex); - static struct ubifs_compressor zlib_compr = { .compr_type = UBIFS_COMPR_ZLIB, - .comp_mutex = &deflate_mutex, - .decomp_mutex = &inflate_mutex, .name = "zlib", .capi_name = "deflate", }; @@ -60,13 +53,8 @@ static struct ubifs_compressor zlib_compr = { #endif #ifdef CONFIG_UBIFS_FS_ZSTD -static DEFINE_MUTEX(zstd_enc_mutex); -static DEFINE_MUTEX(zstd_dec_mutex); - static struct ubifs_compressor zstd_compr = { .compr_type = UBIFS_COMPR_ZSTD, - .comp_mutex = &zstd_enc_mutex, - .decomp_mutex = &zstd_dec_mutex, .name = "zstd", .capi_name = "zstd", }; @@ -80,6 +68,30 @@ static struct ubifs_compressor zstd_compr = { /* All UBIFS compressors */ struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; +static int ubifs_compress_req(const struct ubifs_info *c, + struct acomp_req *req, + void *out_buf, int *out_len, + const char *compr_name) +{ + struct crypto_wait wait; + int in_len = req->slen; + int dlen = *out_len; + int err; + + dlen = min(dlen, in_len - UBIFS_MIN_COMPRESS_DIFF); + + crypto_init_wait(&wait); + acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &wait); + acomp_request_set_dst_dma(req, out_buf, dlen); + err = crypto_acomp_compress(req); + err = crypto_wait_req(err, &wait); + *out_len = req->dlen; + acomp_request_free(req); + + return err; +} + /** * ubifs_compress - compress data. * @c: UBIFS file-system description object @@ -112,23 +124,14 @@ void ubifs_compress(const struct ubifs_info *c, const void *in_buf, if (in_len < UBIFS_MIN_COMPR_LEN) goto no_compr; - if (compr->comp_mutex) - mutex_lock(compr->comp_mutex); - err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf, - (unsigned int *)out_len); - if (compr->comp_mutex) - mutex_unlock(compr->comp_mutex); - if (unlikely(err)) { - ubifs_warn(c, "cannot compress %d bytes, compressor %s, error %d, leave data uncompressed", - in_len, compr->name, err); - goto no_compr; + { + ACOMP_REQUEST_ALLOC(req, compr->cc, GFP_NOFS | __GFP_NOWARN); + + acomp_request_set_src_dma(req, in_buf, in_len); + err = ubifs_compress_req(c, req, out_buf, out_len, compr->name); } - /* - * If the data compressed only slightly, it is better to leave it - * uncompressed to improve read speed. - */ - if (in_len - *out_len < UBIFS_MIN_COMPRESS_DIFF) + if (err) goto no_compr; return; @@ -140,6 +143,83 @@ no_compr: } /** + * ubifs_compress_folio - compress folio. + * @c: UBIFS file-system description object + * @in_folio: data to compress + * @in_offset: offset into @in_folio + * @in_len: length of the data to compress + * @out_buf: output buffer where compressed data should be stored + * @out_len: output buffer length is returned here + * @compr_type: type of compression to use on enter, actually used compression + * type on exit + * + * This function compresses input folio @in_folio of length @in_len and + * stores the result in the output buffer @out_buf and the resulting length + * in @out_len. 
If the input buffer does not compress, it is just copied + * to the @out_buf. The same happens if @compr_type is %UBIFS_COMPR_NONE + * or if compression error occurred. + * + * Note, if the input buffer was not compressed, it is copied to the output + * buffer and %UBIFS_COMPR_NONE is returned in @compr_type. + */ +void ubifs_compress_folio(const struct ubifs_info *c, struct folio *in_folio, + size_t in_offset, int in_len, void *out_buf, + int *out_len, int *compr_type) +{ + int err; + struct ubifs_compressor *compr = ubifs_compressors[*compr_type]; + + if (*compr_type == UBIFS_COMPR_NONE) + goto no_compr; + + /* If the input data is small, do not even try to compress it */ + if (in_len < UBIFS_MIN_COMPR_LEN) + goto no_compr; + + { + ACOMP_REQUEST_ALLOC(req, compr->cc, GFP_NOFS | __GFP_NOWARN); + + acomp_request_set_src_folio(req, in_folio, in_offset, in_len); + err = ubifs_compress_req(c, req, out_buf, out_len, compr->name); + } + + if (err) + goto no_compr; + + return; + +no_compr: + memcpy_from_folio(out_buf, in_folio, in_offset, in_len); + *out_len = in_len; + *compr_type = UBIFS_COMPR_NONE; +} + +static int ubifs_decompress_req(const struct ubifs_info *c, + struct acomp_req *req, + const void *in_buf, int in_len, int *out_len, + const char *compr_name) +{ + struct crypto_wait wait; + int err; + + crypto_init_wait(&wait); + acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &wait); + acomp_request_set_src_dma(req, in_buf, in_len); + err = crypto_acomp_decompress(req); + err = crypto_wait_req(err, &wait); + *out_len = req->dlen; + + if (err) + ubifs_err(c, "cannot decompress %d bytes, compressor %s, error %d", + in_len, compr_name, err); + + acomp_request_free(req); + + return err; +} + +/** * ubifs_decompress - decompress data. * @c: UBIFS file-system description object * @in_buf: data to decompress @@ -155,7 +235,6 @@ no_compr: int ubifs_decompress(const struct ubifs_info *c, const void *in_buf, int in_len, void *out_buf, int *out_len, int compr_type) { - int err; struct ubifs_compressor *compr; if (unlikely(compr_type < 0 || compr_type >= UBIFS_COMPR_TYPES_CNT)) { @@ -176,17 +255,62 @@ int ubifs_decompress(const struct ubifs_info *c, const void *in_buf, return 0; } - if (compr->decomp_mutex) - mutex_lock(compr->decomp_mutex); - err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf, - (unsigned int *)out_len); - if (compr->decomp_mutex) - mutex_unlock(compr->decomp_mutex); - if (err) - ubifs_err(c, "cannot decompress %d bytes, compressor %s, error %d", - in_len, compr->name, err); + { + ACOMP_REQUEST_ALLOC(req, compr->cc, GFP_NOFS | __GFP_NOWARN); - return err; + acomp_request_set_dst_dma(req, out_buf, *out_len); + return ubifs_decompress_req(c, req, in_buf, in_len, out_len, + compr->name); + } +} + +/** + * ubifs_decompress_folio - decompress folio. + * @c: UBIFS file-system description object + * @in_buf: data to decompress + * @in_len: length of the data to decompress + * @out_folio: output folio where decompressed data should + * @out_offset: offset into @out_folio + * @out_len: output length is returned here + * @compr_type: type of compression + * + * This function decompresses data from buffer @in_buf into folio + * @out_folio. The length of the uncompressed data is returned in + * @out_len. This functions returns %0 on success or a negative error + * code on failure. 
+ */ +int ubifs_decompress_folio(const struct ubifs_info *c, const void *in_buf, + int in_len, struct folio *out_folio, + size_t out_offset, int *out_len, int compr_type) +{ + struct ubifs_compressor *compr; + + if (unlikely(compr_type < 0 || compr_type >= UBIFS_COMPR_TYPES_CNT)) { + ubifs_err(c, "invalid compression type %d", compr_type); + return -EINVAL; + } + + compr = ubifs_compressors[compr_type]; + + if (unlikely(!compr->capi_name)) { + ubifs_err(c, "%s compression is not compiled in", compr->name); + return -EINVAL; + } + + if (compr_type == UBIFS_COMPR_NONE) { + memcpy_to_folio(out_folio, out_offset, in_buf, in_len); + *out_len = in_len; + return 0; + } + + { + ACOMP_REQUEST_ALLOC(req, compr->cc, GFP_NOFS | __GFP_NOWARN); + + acomp_request_set_dst_folio(req, out_folio, out_offset, + *out_len); + return ubifs_decompress_req(c, req, in_buf, in_len, out_len, + compr->name); + } } /** @@ -199,7 +323,7 @@ int ubifs_decompress(const struct ubifs_info *c, const void *in_buf, static int __init compr_init(struct ubifs_compressor *compr) { if (compr->capi_name) { - compr->cc = crypto_alloc_comp(compr->capi_name, 0, 0); + compr->cc = crypto_alloc_acomp(compr->capi_name, 0, 0); if (IS_ERR(compr->cc)) { pr_err("UBIFS error (pid %d): cannot initialize compressor %s, error %ld", current->pid, compr->name, PTR_ERR(compr->cc)); @@ -218,7 +342,7 @@ static int __init compr_init(struct ubifs_compressor *compr) static void compr_exit(struct ubifs_compressor *compr) { if (compr->capi_name) - crypto_free_comp(compr->cc); + crypto_free_acomp(compr->cc); } /** diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 5130123005e4..bf311c38d9a8 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -42,8 +42,8 @@ #include <linux/slab.h> #include <linux/migrate.h> -static int read_block(struct inode *inode, void *addr, unsigned int block, - struct ubifs_data_node *dn) +static int read_block(struct inode *inode, struct folio *folio, size_t offset, + unsigned int block, struct ubifs_data_node *dn) { struct ubifs_info *c = inode->i_sb->s_fs_info; int err, len, out_len; @@ -55,7 +55,7 @@ static int read_block(struct inode *inode, void *addr, unsigned int block, if (err) { if (err == -ENOENT) /* Not found, so it must be a hole */ - memset(addr, 0, UBIFS_BLOCK_SIZE); + folio_zero_range(folio, offset, UBIFS_BLOCK_SIZE); return err; } @@ -74,8 +74,8 @@ static int read_block(struct inode *inode, void *addr, unsigned int block, } out_len = UBIFS_BLOCK_SIZE; - err = ubifs_decompress(c, &dn->data, dlen, addr, &out_len, - le16_to_cpu(dn->compr_type)); + err = ubifs_decompress_folio(c, &dn->data, dlen, folio, offset, + &out_len, le16_to_cpu(dn->compr_type)); if (err || len != out_len) goto dump; @@ -85,7 +85,7 @@ static int read_block(struct inode *inode, void *addr, unsigned int block, * appending data). Ensure that the remainder is zeroed out. 
*/ if (len < UBIFS_BLOCK_SIZE) - memset(addr + len, 0, UBIFS_BLOCK_SIZE - len); + folio_zero_range(folio, offset + len, UBIFS_BLOCK_SIZE - len); return 0; @@ -98,27 +98,25 @@ dump: static int do_readpage(struct folio *folio) { - void *addr; int err = 0, i; unsigned int block, beyond; struct ubifs_data_node *dn = NULL; struct inode *inode = folio->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; loff_t i_size = i_size_read(inode); + size_t offset = 0; dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", inode->i_ino, folio->index, i_size, folio->flags); ubifs_assert(c, !folio_test_checked(folio)); ubifs_assert(c, !folio->private); - addr = kmap_local_folio(folio, 0); - block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; beyond = (i_size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT; if (block >= beyond) { /* Reading beyond inode */ folio_set_checked(folio); - addr = folio_zero_tail(folio, 0, addr); + folio_zero_range(folio, 0, folio_size(folio)); goto out; } @@ -135,9 +133,9 @@ static int do_readpage(struct folio *folio) if (block >= beyond) { /* Reading beyond inode */ err = -ENOENT; - memset(addr, 0, UBIFS_BLOCK_SIZE); + folio_zero_range(folio, offset, UBIFS_BLOCK_SIZE); } else { - ret = read_block(inode, addr, block, dn); + ret = read_block(inode, folio, offset, block, dn); if (ret) { err = ret; if (err != -ENOENT) @@ -147,17 +145,13 @@ static int do_readpage(struct folio *folio) int ilen = i_size & (UBIFS_BLOCK_SIZE - 1); if (ilen && ilen < dlen) - memset(addr + ilen, 0, dlen - ilen); + folio_zero_range(folio, offset + ilen, dlen - ilen); } } if (++i >= (UBIFS_BLOCKS_PER_PAGE << folio_order(folio))) break; block += 1; - addr += UBIFS_BLOCK_SIZE; - if (folio_test_highmem(folio) && (offset_in_page(addr) == 0)) { - kunmap_local(addr - UBIFS_BLOCK_SIZE); - addr = kmap_local_folio(folio, i * UBIFS_BLOCK_SIZE); - } + offset += UBIFS_BLOCK_SIZE; } if (err) { @@ -177,8 +171,6 @@ out: kfree(dn); if (!err) folio_mark_uptodate(folio); - flush_dcache_folio(folio); - kunmap_local(addr); return err; } @@ -602,18 +594,16 @@ static int populate_page(struct ubifs_info *c, struct folio *folio, struct inode *inode = folio->mapping->host; loff_t i_size = i_size_read(inode); unsigned int page_block; - void *addr, *zaddr; + size_t offset = 0; pgoff_t end_index; dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", inode->i_ino, folio->index, i_size, folio->flags); - addr = zaddr = kmap_local_folio(folio, 0); - end_index = (i_size - 1) >> PAGE_SHIFT; if (!i_size || folio->index > end_index) { hole = 1; - addr = folio_zero_tail(folio, 0, addr); + folio_zero_range(folio, 0, folio_size(folio)); goto out_hole; } @@ -623,7 +613,7 @@ static int populate_page(struct ubifs_info *c, struct folio *folio, if (nn >= bu->cnt) { hole = 1; - memset(addr, 0, UBIFS_BLOCK_SIZE); + folio_zero_range(folio, offset, UBIFS_BLOCK_SIZE); } else if (key_block(c, &bu->zbranch[nn].key) == page_block) { struct ubifs_data_node *dn; @@ -645,13 +635,15 @@ static int populate_page(struct ubifs_info *c, struct folio *folio, goto out_err; } - err = ubifs_decompress(c, &dn->data, dlen, addr, &out_len, - le16_to_cpu(dn->compr_type)); + err = ubifs_decompress_folio( + c, &dn->data, dlen, folio, offset, &out_len, + le16_to_cpu(dn->compr_type)); if (err || len != out_len) goto out_err; if (len < UBIFS_BLOCK_SIZE) - memset(addr + len, 0, UBIFS_BLOCK_SIZE - len); + folio_zero_range(folio, offset + len, + UBIFS_BLOCK_SIZE - len); nn += 1; read = (i << UBIFS_BLOCK_SHIFT) + len; @@ -660,23 +652,19 @@ static int populate_page(struct ubifs_info 
*c, struct folio *folio, continue; } else { hole = 1; - memset(addr, 0, UBIFS_BLOCK_SIZE); + folio_zero_range(folio, offset, UBIFS_BLOCK_SIZE); } if (++i >= UBIFS_BLOCKS_PER_PAGE) break; - addr += UBIFS_BLOCK_SIZE; + offset += UBIFS_BLOCK_SIZE; page_block += 1; - if (folio_test_highmem(folio) && (offset_in_page(addr) == 0)) { - kunmap_local(addr - UBIFS_BLOCK_SIZE); - addr = kmap_local_folio(folio, i * UBIFS_BLOCK_SIZE); - } } if (end_index == folio->index) { int len = i_size & (PAGE_SIZE - 1); if (len && len < read) - memset(zaddr + len, 0, read - len); + folio_zero_range(folio, len, read - len); } out_hole: @@ -686,14 +674,10 @@ out_hole: } folio_mark_uptodate(folio); - flush_dcache_folio(folio); - kunmap_local(addr); *n = nn; return 0; out_err: - flush_dcache_folio(folio); - kunmap_local(addr); ubifs_err(c, "bad data node (block %u, inode %lu)", page_block, inode->i_ino); return -EINVAL; @@ -898,7 +882,6 @@ static int do_writepage(struct folio *folio, size_t len) { int err = 0, blen; unsigned int block; - void *addr; size_t offset = 0; union ubifs_key key; struct inode *inode = folio->mapping->host; @@ -913,26 +896,19 @@ static int do_writepage(struct folio *folio, size_t len) folio_start_writeback(folio); - addr = kmap_local_folio(folio, offset); block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; for (;;) { blen = min_t(size_t, len, UBIFS_BLOCK_SIZE); data_key_init(c, &key, inode->i_ino, block); - err = ubifs_jnl_write_data(c, inode, &key, addr, blen); + err = ubifs_jnl_write_data(c, inode, &key, folio, offset, blen); if (err) break; len -= blen; if (!len) break; block += 1; - addr += blen; - if (folio_test_highmem(folio) && !offset_in_page(addr)) { - kunmap_local(addr - blen); - offset += PAGE_SIZE; - addr = kmap_local_folio(folio, offset); - } + offset += blen; } - kunmap_local(addr); if (err) { mapping_set_error(folio->mapping, err); ubifs_err(c, "cannot write folio %lu of inode %lu, error %d", diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 01d8eb170382..a79f229df475 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -1179,8 +1179,7 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) wbuf->c = c; wbuf->next_ino = 0; - hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - wbuf->timer.function = wbuf_timer_callback_nolock; + hrtimer_setup(&wbuf->timer, wbuf_timer_callback_nolock, CLOCK_MONOTONIC, HRTIMER_MODE_REL); return 0; } diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 36ba79fbd2ff..ee954e64ce7f 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -845,14 +845,16 @@ out_ro: * @c: UBIFS file-system description object * @inode: inode the data node belongs to * @key: node key - * @buf: buffer to write + * @folio: buffer to write + * @offset: offset to write at * @len: data length (must not exceed %UBIFS_BLOCK_SIZE) * * This function writes a data node to the journal. Returns %0 if the data node * was successfully written, and a negative error code in case of failure. 
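The conversions above all follow one pattern: instead of kmapping the folio once and walking a pointer (with explicit kunmap/kmap bookkeeping whenever the pointer crossed a highmem page boundary), callers now carry a plain byte offset into the folio and let helpers such as folio_zero_range() and the folio-taking journal and compression interfaces do any temporary mapping internally. A minimal sketch of the resulting per-block loop, with write_one_block() as a hypothetical stand-in for the ubifs_jnl_write_data() call:

	size_t offset = 0;
	unsigned int block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;

	for (;;) {
		int blen = min_t(size_t, len, UBIFS_BLOCK_SIZE);

		err = write_one_block(inode, block, folio, offset, blen);
		if (err)
			break;
		len -= blen;
		if (!len)
			break;
		block += 1;
		offset += blen;	/* no highmem remap bookkeeping needed */
	}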
*/ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, - const union ubifs_key *key, const void *buf, int len) + const union ubifs_key *key, struct folio *folio, + size_t offset, int len) { struct ubifs_data_node *data; int err, lnum, offs, compr_type, out_len, compr_len, auth_len; @@ -896,7 +898,8 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, compr_type = ui->compr_type; out_len = compr_len = dlen - UBIFS_DATA_NODE_SZ; - ubifs_compress(c, buf, len, &data->data, &compr_len, &compr_type); + ubifs_compress_folio(c, folio, offset, len, &data->data, &compr_len, + &compr_type); ubifs_assert(c, compr_len <= UBIFS_BLOCK_SIZE); if (encrypted) { @@ -1625,7 +1628,7 @@ static int truncate_data_node(const struct ubifs_info *c, const struct inode *in int err, dlen, compr_type, out_len, data_size; out_len = le32_to_cpu(dn->size); - buf = kmalloc_array(out_len, WORST_COMPR_FACTOR, GFP_NOFS); + buf = kmalloc(out_len, GFP_NOFS); if (!buf) return -ENOMEM; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 3375bbe0508c..256dbaeeb0de 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -124,13 +124,6 @@ #define OLD_ZNODE_AGE 20 #define YOUNG_ZNODE_AGE 5 -/* - * Some compressors, like LZO, may end up with more data then the input buffer. - * So UBIFS always allocates larger output buffer, to be sure the compressor - * will not corrupt memory in case of worst case compression. - */ -#define WORST_COMPR_FACTOR 2 - #ifdef CONFIG_FS_ENCRYPTION #define UBIFS_CIPHER_BLOCK_SIZE FSCRYPT_CONTENTS_ALIGNMENT #else @@ -141,7 +134,7 @@ * How much memory is needed for a buffer where we compress a data node. */ #define COMPRESSED_DATA_NODE_BUF_SZ \ - (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR) + (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE) /* Maximum expected tree height for use by bottom_up_buf */ #define BOTTOM_UP_HEIGHT 64 @@ -270,6 +263,8 @@ enum { ASSACT_PANIC, }; +struct folio; + /** * struct ubifs_old_idx - index node obsoleted since last commit start. * @rb: rb-tree node @@ -835,16 +830,12 @@ struct ubifs_node_range { * struct ubifs_compressor - UBIFS compressor description structure. 
* @compr_type: compressor type (%UBIFS_COMPR_LZO, etc) * @cc: cryptoapi compressor handle - * @comp_mutex: mutex used during compression - * @decomp_mutex: mutex used during decompression * @name: compressor name * @capi_name: cryptoapi compressor name */ struct ubifs_compressor { int compr_type; - struct crypto_comp *cc; - struct mutex *comp_mutex; - struct mutex *decomp_mutex; + struct crypto_acomp *cc; const char *name; const char *capi_name; }; @@ -1795,7 +1786,8 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, const struct fscrypt_name *nm, const struct inode *inode, int deletion, int xent, int in_orphan); int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, - const union ubifs_key *key, const void *buf, int len); + const union ubifs_key *key, struct folio *folio, + size_t offset, int len); int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode); int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode); int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir, @@ -2095,8 +2087,14 @@ int __init ubifs_compressors_init(void); void ubifs_compressors_exit(void); void ubifs_compress(const struct ubifs_info *c, const void *in_buf, int in_len, void *out_buf, int *out_len, int *compr_type); +void ubifs_compress_folio(const struct ubifs_info *c, struct folio *folio, + size_t offset, int in_len, void *out_buf, + int *out_len, int *compr_type); int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len, void *out, int *out_len, int compr_type); +int ubifs_decompress_folio(const struct ubifs_info *c, const void *buf, + int len, struct folio *folio, size_t offset, + int *out_len, int compr_type); /* sysfs.c */ int ubifs_sysfs_init(void); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 70c907fe8af9..4386dd845e40 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -810,6 +810,7 @@ static int inode_getblk(struct inode *inode, struct udf_map_rq *map) } map->oflags = UDF_BLK_MAPPED; map->pblk = udf_get_lb_pblock(inode->i_sb, &eloc, offset); + ret = 0; goto out_free; } diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig index da786a687fdc..4ad2c36550f1 100644 --- a/fs/unicode/Kconfig +++ b/fs/unicode/Kconfig @@ -10,6 +10,7 @@ config UNICODE be a separate loadable module that gets requested only when a file system actually use it. -config UNICODE_NORMALIZATION_SELFTEST +config UNICODE_NORMALIZATION_KUNIT_TEST tristate "Test UTF-8 normalization support" - depends on UNICODE + depends on UNICODE && KUNIT + default KUNIT_ALL_TESTS diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile index e309afe2b2bb..d95be7fb9f6b 100644 --- a/fs/unicode/Makefile +++ b/fs/unicode/Makefile @@ -4,7 +4,7 @@ ifneq ($(CONFIG_UNICODE),) obj-y += unicode.o endif obj-$(CONFIG_UNICODE) += utf8data.o -obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o +obj-$(CONFIG_UNICODE_NORMALIZATION_KUNIT_TEST) += tests/utf8_kunit.o unicode-y := utf8-norm.o utf8-core.o diff --git a/fs/unicode/tests/.kunitconfig b/fs/unicode/tests/.kunitconfig new file mode 100644 index 000000000000..62dd5c171f9c --- /dev/null +++ b/fs/unicode/tests/.kunitconfig @@ -0,0 +1,3 @@ +CONFIG_KUNIT=y +CONFIG_UNICODE=y +CONFIG_UNICODE_NORMALIZATION_KUNIT_TEST=y diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/tests/utf8_kunit.c index 5ddaf27b21a6..5063e8138aec 100644 --- a/fs/unicode/utf8-selftest.c +++ b/fs/unicode/tests/utf8_kunit.c @@ -1,34 +1,14 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Kernel module for testing utf-8 support. 
+ * KUnit tests for utf-8 support. * * Copyright 2017 Collabora Ltd. */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/module.h> -#include <linux/printk.h> #include <linux/unicode.h> -#include <linux/dcache.h> - -#include "utf8n.h" - -static unsigned int failed_tests; -static unsigned int total_tests; - -#define _test(cond, func, line, fmt, ...) do { \ - total_tests++; \ - if (!cond) { \ - failed_tests++; \ - pr_err("test %s:%d Failed: %s%s", \ - func, line, #cond, (fmt?":":".")); \ - if (fmt) \ - pr_err(fmt, ##__VA_ARGS__); \ - } \ - } while (0) -#define test_f(cond, fmt, ...) _test(cond, __func__, __LINE__, fmt, ##__VA_ARGS__) -#define test(cond) _test(cond, __func__, __LINE__, "") +#include <kunit/test.h> + +#include "../utf8n.h" static const struct { /* UTF-8 strings in this vector _must_ be NULL-terminated. */ @@ -167,69 +147,74 @@ static int utf8cursor(struct utf8cursor *u8c, const struct unicode_map *um, return utf8ncursor(u8c, um, n, s, (unsigned int)-1); } -static void check_utf8_nfdi(struct unicode_map *um) +static void check_utf8_nfdi(struct kunit *test) { int i; struct utf8cursor u8c; + struct unicode_map *um = test->priv; for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { int len = strlen(nfdi_test_data[i].str); int nlen = strlen(nfdi_test_data[i].dec); int j = 0; unsigned char c; + int ret; + + KUNIT_EXPECT_EQ(test, utf8len(um, UTF8_NFDI, nfdi_test_data[i].str), nlen); + KUNIT_EXPECT_EQ(test, utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len), + nlen); - test((utf8len(um, UTF8_NFDI, nfdi_test_data[i].str) == nlen)); - test((utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len) == - nlen)); - if (utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str) < 0) - pr_err("can't create cursor\n"); + ret = utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str); + KUNIT_EXPECT_TRUE_MSG(test, ret >= 0, "Can't create cursor\n"); while ((c = utf8byte(&u8c)) > 0) { - test_f((c == nfdi_test_data[i].dec[j]), - "Unexpected byte 0x%x should be 0x%x\n", - c, nfdi_test_data[i].dec[j]); + KUNIT_EXPECT_EQ_MSG(test, c, nfdi_test_data[i].dec[j], + "Unexpected byte 0x%x should be 0x%x\n", + c, nfdi_test_data[i].dec[j]); j++; } - test((j == nlen)); + KUNIT_EXPECT_EQ(test, j, nlen); } } -static void check_utf8_nfdicf(struct unicode_map *um) +static void check_utf8_nfdicf(struct kunit *test) { int i; struct utf8cursor u8c; + struct unicode_map *um = test->priv; for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) { int len = strlen(nfdicf_test_data[i].str); int nlen = strlen(nfdicf_test_data[i].ncf); int j = 0; + int ret; unsigned char c; - test((utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str) == - nlen)); - test((utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len) == - nlen)); + KUNIT_EXPECT_EQ(test, utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str), + nlen); + KUNIT_EXPECT_EQ(test, utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len), + nlen); - if (utf8cursor(&u8c, um, UTF8_NFDICF, - nfdicf_test_data[i].str) < 0) - pr_err("can't create cursor\n"); + ret = utf8cursor(&u8c, um, UTF8_NFDICF, nfdicf_test_data[i].str); + KUNIT_EXPECT_TRUE_MSG(test, ret >= 0, "Can't create cursor\n"); while ((c = utf8byte(&u8c)) > 0) { - test_f((c == nfdicf_test_data[i].ncf[j]), - "Unexpected byte 0x%x should be 0x%x\n", - c, nfdicf_test_data[i].ncf[j]); + KUNIT_EXPECT_EQ_MSG(test, c, nfdicf_test_data[i].ncf[j], + "Unexpected byte 0x%x should be 0x%x\n", + c, nfdicf_test_data[i].ncf[j]); j++; } - test((j == nlen)); + KUNIT_EXPECT_EQ(test, j, nlen); } } -static void check_utf8_comparisons(struct 
unicode_map *table) +static void check_utf8_comparisons(struct kunit *test) { int i; + struct unicode_map *um = test->priv; for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { const struct qstr s1 = {.name = nfdi_test_data[i].str, @@ -237,8 +222,9 @@ static void check_utf8_comparisons(struct unicode_map *table) const struct qstr s2 = {.name = nfdi_test_data[i].dec, .len = sizeof(nfdi_test_data[i].dec)}; - test_f(!utf8_strncmp(table, &s1, &s2), - "%s %s comparison mismatch\n", s1.name, s2.name); + /* strncmp returns 0 when strings are equal */ + KUNIT_EXPECT_TRUE_MSG(test, utf8_strncmp(um, &s1, &s2) == 0, + "%s %s comparison mismatch\n", s1.name, s2.name); } for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) { @@ -247,62 +233,65 @@ static void check_utf8_comparisons(struct unicode_map *table) const struct qstr s2 = {.name = nfdicf_test_data[i].ncf, .len = sizeof(nfdicf_test_data[i].ncf)}; - test_f(!utf8_strncasecmp(table, &s1, &s2), - "%s %s comparison mismatch\n", s1.name, s2.name); + /* strncasecmp returns 0 when strings are equal */ + KUNIT_EXPECT_TRUE_MSG(test, utf8_strncasecmp(um, &s1, &s2) == 0, + "%s %s comparison mismatch\n", s1.name, s2.name); } } -static void check_supported_versions(struct unicode_map *um) +static void check_supported_versions(struct kunit *test) { + struct unicode_map *um = test->priv; /* Unicode 7.0.0 should be supported. */ - test(utf8version_is_supported(um, UNICODE_AGE(7, 0, 0))); + KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UNICODE_AGE(7, 0, 0))); /* Unicode 9.0.0 should be supported. */ - test(utf8version_is_supported(um, UNICODE_AGE(9, 0, 0))); + KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UNICODE_AGE(9, 0, 0))); /* Unicode 1x.0.0 (the latest version) should be supported. */ - test(utf8version_is_supported(um, UTF8_LATEST)); + KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UTF8_LATEST)); /* Next versions don't exist. 
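The selftest-to-KUnit conversion in this file is mechanical, and a skeleton makes the shape easier to see: each check_*() function takes a struct kunit, the shared unicode_map moves into test->priv via the suite's init/exit hooks, and the ad-hoc test()/test_f() macros become KUNIT_EXPECT_*() calls. A condensed sketch, with error handling simplified relative to the code above:

	static int suite_init(struct kunit *test)
	{
		struct unicode_map *um = utf8_load(UTF8_LATEST);

		KUNIT_ASSERT_FALSE(test, IS_ERR(um));
		test->priv = um;
		return 0;
	}

	static void suite_exit(struct kunit *test)
	{
		utf8_unload(test->priv);
	}

	static struct kunit_case cases[] = {
		KUNIT_CASE(check_utf8_nfdi),
		/* ... remaining cases ... */
		{}
	};

	static struct kunit_suite suite = {
		.name		= "unicode_normalization",
		.test_cases	= cases,
		.init		= suite_init,
		.exit		= suite_exit,
	};
	kunit_test_suite(suite);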
*/ - test(!utf8version_is_supported(um, UNICODE_AGE(13, 0, 0))); - test(!utf8version_is_supported(um, UNICODE_AGE(0, 0, 0))); - test(!utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1))); + KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(13, 0, 0))); + KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(0, 0, 0))); + KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1))); } -static int __init init_test_ucd(void) +static struct kunit_case unicode_normalization_test_cases[] = { + KUNIT_CASE(check_supported_versions), + KUNIT_CASE(check_utf8_comparisons), + KUNIT_CASE(check_utf8_nfdicf), + KUNIT_CASE(check_utf8_nfdi), + {} +}; + +static int init_test_ucd(struct kunit *test) { - struct unicode_map *um; + struct unicode_map *um = utf8_load(UTF8_LATEST); - failed_tests = 0; - total_tests = 0; + test->priv = um; - um = utf8_load(UTF8_LATEST); - if (IS_ERR(um)) { - pr_err("%s: Unable to load utf8 table.\n", __func__); - return PTR_ERR(um); - } + KUNIT_EXPECT_EQ_MSG(test, IS_ERR(um), 0, + "%s: Unable to load utf8 table.\n", __func__); - check_supported_versions(um); - check_utf8_nfdi(um); - check_utf8_nfdicf(um); - check_utf8_comparisons(um); - - if (!failed_tests) - pr_info("All %u tests passed\n", total_tests); - else - pr_err("%u out of %u tests failed\n", failed_tests, - total_tests); - utf8_unload(um); return 0; } -static void __exit exit_test_ucd(void) +static void exit_test_ucd(struct kunit *test) { + utf8_unload(test->priv); } -module_init(init_test_ucd); -module_exit(exit_test_ucd); +static struct kunit_suite unicode_normalization_test_suite = { + .name = "unicode_normalization", + .test_cases = unicode_normalization_test_cases, + .init = init_test_ucd, + .exit = exit_test_ucd, +}; + +kunit_test_suite(unicode_normalization_test_suite); + MODULE_AUTHOR("Gabriel Krisman Bertazi <krisman@collabora.co.uk>"); -MODULE_DESCRIPTION("Kernel module for testing utf-8 support"); +MODULE_DESCRIPTION("KUnit tests for utf-8 support."); MODULE_LICENSE("GPL"); diff --git a/fs/unicode/utf8-norm.c b/fs/unicode/utf8-norm.c index 768f8ab448b8..7b998c99c88d 100644 --- a/fs/unicode/utf8-norm.c +++ b/fs/unicode/utf8-norm.c @@ -586,7 +586,7 @@ ccc_mismatch: } } -#ifdef CONFIG_UNICODE_NORMALIZATION_SELFTEST_MODULE +#if IS_MODULE(CONFIG_UNICODE_NORMALIZATION_KUNIT_TEST) EXPORT_SYMBOL_GPL(utf8version_is_supported); EXPORT_SYMBOL_GPL(utf8nlen); EXPORT_SYMBOL_GPL(utf8ncursor); diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig index e1036e535352..40569d3527a7 100644 --- a/fs/verity/Kconfig +++ b/fs/verity/Kconfig @@ -4,13 +4,9 @@ config FS_VERITY bool "FS Verity (read-only file-based authenticity protection)" select CRYPTO select CRYPTO_HASH_INFO - # SHA-256 is implied as it's intended to be the default hash algorithm. + # SHA-256 is selected as it's intended to be the default hash algorithm. # To avoid bloat, other wanted algorithms must be selected explicitly. - # Note that CRYPTO_SHA256 denotes the generic C implementation, but - # some architectures provided optimized implementations of the same - # algorithm that may be used instead. In this case, CRYPTO_SHA256 may - # be omitted even if SHA-256 is being used. - imply CRYPTO_SHA256 + select CRYPTO_SHA256 help This option enables fs-verity. fs-verity is the dm-verity mechanism implemented at the file level. 
On supported diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 7afa51e41427..5bf501cf8271 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -64,6 +64,7 @@ xfs-y += $(addprefix libxfs/, \ xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \ xfs_rtbitmap.o \ xfs_rtgroup.o \ + xfs_zones.o \ ) # highlevel code @@ -136,7 +137,11 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \ xfs_quotaops.o # xfs_rtbitmap is shared with libxfs -xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o +xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \ + xfs_zone_alloc.o \ + xfs_zone_gc.o \ + xfs_zone_info.o \ + xfs_zone_space_resv.o xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index b59cb461e096..e6ba914f6d06 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -301,7 +301,7 @@ xfs_get_aghdr_buf( struct xfs_buf *bp; int error; - error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0, &bp); + error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, &bp); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 0ef19f1469ec..63255820b58a 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -34,13 +34,13 @@ #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_refcount.h" -#include "xfs_icache.h" #include "xfs_iomap.h" #include "xfs_health.h" #include "xfs_bmap_item.h" #include "xfs_symlink_remote.h" #include "xfs_inode_util.h" #include "xfs_rtgroup.h" +#include "xfs_zone_alloc.h" struct kmem_cache *xfs_bmap_intent_cache; @@ -171,18 +171,16 @@ xfs_bmbt_update( * Compute the worst-case number of indirect blocks that will be used * for ip's delayed extent of length "len". */ -STATIC xfs_filblks_t +xfs_filblks_t xfs_bmap_worst_indlen( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_filblks_t len) /* delayed extent length */ + struct xfs_inode *ip, /* incore inode pointer */ + xfs_filblks_t len) /* delayed extent length */ { - int level; /* btree level number */ - int maxrecs; /* maximum record count at this level */ - xfs_mount_t *mp; /* mount structure */ - xfs_filblks_t rval; /* return value */ + struct xfs_mount *mp = ip->i_mount; + int maxrecs = mp->m_bmap_dmxr[0]; + int level; + xfs_filblks_t rval; - mp = ip->i_mount; - maxrecs = mp->m_bmap_dmxr[0]; for (level = 0, rval = 0; level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); level++) { @@ -2572,146 +2570,6 @@ done: } /* - * Convert a hole to a delayed allocation. - */ -STATIC void -xfs_bmap_add_extent_hole_delay( - xfs_inode_t *ip, /* incore inode pointer */ - int whichfork, - struct xfs_iext_cursor *icur, - xfs_bmbt_irec_t *new) /* new data to add to file extents */ -{ - struct xfs_ifork *ifp; /* inode fork pointer */ - xfs_bmbt_irec_t left; /* left neighbor extent entry */ - xfs_filblks_t newlen=0; /* new indirect size */ - xfs_filblks_t oldlen=0; /* old indirect size */ - xfs_bmbt_irec_t right; /* right neighbor extent entry */ - uint32_t state = xfs_bmap_fork_to_state(whichfork); - xfs_filblks_t temp; /* temp for indirect calculations */ - - ifp = xfs_ifork_ptr(ip, whichfork); - ASSERT(isnullstartblock(new->br_startblock)); - - /* - * Check and set flags if this segment has a left neighbor - */ - if (xfs_iext_peek_prev_extent(ifp, icur, &left)) { - state |= BMAP_LEFT_VALID; - if (isnullstartblock(left.br_startblock)) - state |= BMAP_LEFT_DELAY; - } - - /* - * Check and set flags if the current (right) segment exists. - * If it doesn't exist, we're converting the hole at end-of-file. 
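The contiguity flags computed above drive the four-way switch that follows; since the deleted body is long, the decision table restated compactly:

	/* Merging a new delalloc extent with its neighbors:
	 *   LEFT_CONTIG | RIGHT_CONTIG -> collapse left + new + right into one
	 *   LEFT_CONTIG                -> extend the left record
	 *   RIGHT_CONTIG               -> grow the right record downward
	 *   neither                    -> insert a new record
	 * Every case recomputes the worst-case indirect reservation for the
	 * merged length, and any surplus is returned via xfs_add_fdblocks().
	 */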
- */ - if (xfs_iext_get_extent(ifp, icur, &right)) { - state |= BMAP_RIGHT_VALID; - if (isnullstartblock(right.br_startblock)) - state |= BMAP_RIGHT_DELAY; - } - - /* - * Set contiguity flags on the left and right neighbors. - * Don't let extents get too large, even if the pieces are contiguous. - */ - if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && - left.br_startoff + left.br_blockcount == new->br_startoff && - left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) - state |= BMAP_LEFT_CONTIG; - - if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && - new->br_startoff + new->br_blockcount == right.br_startoff && - new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && - (!(state & BMAP_LEFT_CONTIG) || - (left.br_blockcount + new->br_blockcount + - right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))) - state |= BMAP_RIGHT_CONTIG; - - /* - * Switch out based on the contiguity flags. - */ - switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { - case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: - /* - * New allocation is contiguous with delayed allocations - * on the left and on the right. - * Merge all three into a single extent record. - */ - temp = left.br_blockcount + new->br_blockcount + - right.br_blockcount; - - oldlen = startblockval(left.br_startblock) + - startblockval(new->br_startblock) + - startblockval(right.br_startblock); - newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - oldlen); - left.br_startblock = nullstartblock(newlen); - left.br_blockcount = temp; - - xfs_iext_remove(ip, icur, state); - xfs_iext_prev(ifp, icur); - xfs_iext_update_extent(ip, state, icur, &left); - break; - - case BMAP_LEFT_CONTIG: - /* - * New allocation is contiguous with a delayed allocation - * on the left. - * Merge the new allocation with the left neighbor. - */ - temp = left.br_blockcount + new->br_blockcount; - - oldlen = startblockval(left.br_startblock) + - startblockval(new->br_startblock); - newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - oldlen); - left.br_blockcount = temp; - left.br_startblock = nullstartblock(newlen); - - xfs_iext_prev(ifp, icur); - xfs_iext_update_extent(ip, state, icur, &left); - break; - - case BMAP_RIGHT_CONTIG: - /* - * New allocation is contiguous with a delayed allocation - * on the right. - * Merge the new allocation with the right neighbor. - */ - temp = new->br_blockcount + right.br_blockcount; - oldlen = startblockval(new->br_startblock) + - startblockval(right.br_startblock); - newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - oldlen); - right.br_startoff = new->br_startoff; - right.br_startblock = nullstartblock(newlen); - right.br_blockcount = temp; - xfs_iext_update_extent(ip, state, icur, &right); - break; - - case 0: - /* - * New allocation is not contiguous with another - * delayed allocation. - * Insert a new entry. - */ - oldlen = newlen = 0; - xfs_iext_insert(ip, icur, new, state); - break; - } - if (oldlen != newlen) { - ASSERT(oldlen > newlen); - xfs_add_fdblocks(ip->i_mount, oldlen - newlen); - - /* - * Nothing to do for disk quota accounting here. - */ - xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen); - } -} - -/* * Convert a hole to a real allocation. */ STATIC int /* error */ @@ -4039,144 +3897,6 @@ xfs_bmapi_read( return 0; } -/* - * Add a delayed allocation extent to an inode. Blocks are reserved from the - * global pool and the extent inserted into the inode in-core extent tree. 
- * - * On entry, got refers to the first extent beyond the offset of the extent to - * allocate or eof is specified if no such extent exists. On return, got refers - * to the extent record that was inserted to the inode fork. - * - * Note that the allocated extent may have been merged with contiguous extents - * during insertion into the inode fork. Thus, got does not reflect the current - * state of the inode fork on return. If necessary, the caller can use lastx to - * look up the updated record in the inode fork. - */ -int -xfs_bmapi_reserve_delalloc( - struct xfs_inode *ip, - int whichfork, - xfs_fileoff_t off, - xfs_filblks_t len, - xfs_filblks_t prealloc, - struct xfs_bmbt_irec *got, - struct xfs_iext_cursor *icur, - int eof) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); - xfs_extlen_t alen; - xfs_extlen_t indlen; - uint64_t fdblocks; - int error; - xfs_fileoff_t aoff; - bool use_cowextszhint = - whichfork == XFS_COW_FORK && !prealloc; - -retry: - /* - * Cap the alloc length. Keep track of prealloc so we know whether to - * tag the inode before we return. - */ - aoff = off; - alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN); - if (!eof) - alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); - if (prealloc && alen >= len) - prealloc = alen - len; - - /* - * If we're targetting the COW fork but aren't creating a speculative - * posteof preallocation, try to expand the reservation to align with - * the COW extent size hint if there's sufficient free space. - * - * Unlike the data fork, the CoW cancellation functions will free all - * the reservations at inactivation, so we don't require that every - * delalloc reservation have a dirty pagecache. - */ - if (use_cowextszhint) { - struct xfs_bmbt_irec prev; - xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip); - - if (!xfs_iext_peek_prev_extent(ifp, icur, &prev)) - prev.br_startoff = NULLFILEOFF; - - error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof, - 1, 0, &aoff, &alen); - ASSERT(!error); - } - - /* - * Make a transaction-less quota reservation for delayed allocation - * blocks. This number gets adjusted later. We return if we haven't - * allocated blocks already inside this loop. - */ - error = xfs_quota_reserve_blkres(ip, alen); - if (error) - goto out; - - /* - * Split changing sb for alen and indlen since they could be coming - * from different places. - */ - indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen); - ASSERT(indlen > 0); - - fdblocks = indlen; - if (XFS_IS_REALTIME_INODE(ip)) { - error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen)); - if (error) - goto out_unreserve_quota; - } else { - fdblocks += alen; - } - - error = xfs_dec_fdblocks(mp, fdblocks, false); - if (error) - goto out_unreserve_frextents; - - ip->i_delayed_blks += alen; - xfs_mod_delalloc(ip, alen, indlen); - - got->br_startoff = aoff; - got->br_startblock = nullstartblock(indlen); - got->br_blockcount = alen; - got->br_state = XFS_EXT_NORM; - - xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got); - - /* - * Tag the inode if blocks were preallocated. Note that COW fork - * preallocation can occur at the start or end of the extent, even when - * prealloc == 0, so we must also check the aligned offset and length. 
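The reservation arithmetic being deleted is easy to lose in the long hunk; its accounting core, condensed (the full code also unwinds the quota and rtextent reservations on failure, and retries once without preallocation on -ENOSPC or -EDQUOT):

	xfs_extlen_t	indlen = xfs_bmap_worst_indlen(ip, alen);
	uint64_t	fdblocks = indlen;	/* btree overhead always */
	int		error = 0;

	if (XFS_IS_REALTIME_INODE(ip))
		error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
	else
		fdblocks += alen;		/* data blocks from fdblocks */
	if (!error)
		error = xfs_dec_fdblocks(mp, fdblocks, false);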
- */ - if (whichfork == XFS_DATA_FORK && prealloc) - xfs_inode_set_eofblocks_tag(ip); - if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) - xfs_inode_set_cowblocks_tag(ip); - - return 0; - -out_unreserve_frextents: - if (XFS_IS_REALTIME_INODE(ip)) - xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen)); -out_unreserve_quota: - if (XFS_IS_QUOTA_ON(mp)) - xfs_quota_unreserve_blkres(ip, alen); -out: - if (error == -ENOSPC || error == -EDQUOT) { - trace_xfs_delalloc_enospc(ip, off, len); - - if (prealloc || use_cowextszhint) { - /* retry without any preallocation */ - use_cowextszhint = false; - prealloc = 0; - goto retry; - } - } - return error; -} - static int xfs_bmapi_allocate( struct xfs_bmalloca *bma) @@ -4948,7 +4668,8 @@ xfs_bmap_del_extent_delay( int whichfork, struct xfs_iext_cursor *icur, struct xfs_bmbt_irec *got, - struct xfs_bmbt_irec *del) + struct xfs_bmbt_irec *del, + uint32_t bflags) /* bmapi flags */ { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); @@ -5068,10 +4789,18 @@ xfs_bmap_del_extent_delay( da_diff = da_old - da_new; fdblocks = da_diff; - if (isrt) - xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, del->br_blockcount)); - else + if (bflags & XFS_BMAPI_REMAP) { + ; + } else if (isrt) { + xfs_rtbxlen_t rtxlen; + + rtxlen = xfs_blen_to_rtbxlen(mp, del->br_blockcount); + if (xfs_is_zoned_inode(ip)) + xfs_zoned_add_available(mp, rtxlen); + xfs_add_frextents(mp, rtxlen); + } else { fdblocks += del->br_blockcount; + } xfs_add_fdblocks(mp, fdblocks); xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff); @@ -5670,7 +5399,8 @@ __xfs_bunmapi( delete: if (wasdel) { - xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del); + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, + &del, flags); } else { error = xfs_bmap_del_extent_real(ip, tp, &icur, cur, &del, &tmp_logflags, whichfork, diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 4b721d935994..b4d9c6e0f3f9 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -204,7 +204,7 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extnum_t nexts, int *done); void xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, - struct xfs_bmbt_irec *del); + struct xfs_bmbt_irec *del, uint32_t bflags); void xfs_bmap_del_extent_cow(struct xfs_inode *ip, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del); @@ -219,10 +219,6 @@ int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip, bool *done, xfs_fileoff_t stop_fsb); int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t split_offset); -int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, - xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, - struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, - int eof); int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork, xfs_off_t offset, struct iomap *iomap, unsigned int *seq); int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp, @@ -233,6 +229,7 @@ xfs_extlen_t xfs_bmapi_minleft(struct xfs_trans *tp, struct xfs_inode *ip, int fork); int xfs_bmap_btalloc_low_space(struct xfs_bmalloca *ap, struct xfs_alloc_arg *args); +xfs_filblks_t xfs_bmap_worst_indlen(struct xfs_inode *ip, xfs_filblks_t len); enum xfs_bmap_intent_type { XFS_BMAP_MAP = 1, diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index b1007fb661ba..9566a7623365 
100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -178,9 +178,10 @@ typedef struct xfs_sb { xfs_rgnumber_t sb_rgcount; /* number of realtime groups */ xfs_rtxlen_t sb_rgextents; /* size of a realtime group in rtx */ - uint8_t sb_rgblklog; /* rt group number shift */ uint8_t sb_pad[7]; /* zeroes */ + xfs_rfsblock_t sb_rtstart; /* start of internal RT section (FSB) */ + xfs_filblks_t sb_rtreserved; /* reserved (zoned) RT blocks */ /* must be padded to 64 bit alignment */ } xfs_sb_t; @@ -270,9 +271,10 @@ struct xfs_dsb { __be64 sb_metadirino; /* metadata directory tree root */ __be32 sb_rgcount; /* # of realtime groups */ __be32 sb_rgextents; /* size of rtgroup in rtx */ - __u8 sb_rgblklog; /* rt group number shift */ __u8 sb_pad[7]; /* zeroes */ + __be64 sb_rtstart; /* start of internal RT section (FSB) */ + __be64 sb_rtreserved; /* reserved (zoned) RT blocks */ /* * The size of this structure must be padded to 64 bit alignment. @@ -395,6 +397,9 @@ xfs_sb_has_ro_compat_feature( #define XFS_SB_FEAT_INCOMPAT_EXCHRANGE (1 << 6) /* exchangerange supported */ #define XFS_SB_FEAT_INCOMPAT_PARENT (1 << 7) /* parent pointers */ #define XFS_SB_FEAT_INCOMPAT_METADIR (1 << 8) /* metadata dir tree */ +#define XFS_SB_FEAT_INCOMPAT_ZONED (1 << 9) /* zoned RT allocator */ +#define XFS_SB_FEAT_INCOMPAT_ZONE_GAPS (1 << 10) /* RTGs have LBA gaps */ + #define XFS_SB_FEAT_INCOMPAT_ALL \ (XFS_SB_FEAT_INCOMPAT_FTYPE | \ XFS_SB_FEAT_INCOMPAT_SPINODES | \ @@ -404,7 +409,9 @@ xfs_sb_has_ro_compat_feature( XFS_SB_FEAT_INCOMPAT_NREXT64 | \ XFS_SB_FEAT_INCOMPAT_EXCHRANGE | \ XFS_SB_FEAT_INCOMPAT_PARENT | \ - XFS_SB_FEAT_INCOMPAT_METADIR) + XFS_SB_FEAT_INCOMPAT_METADIR | \ + XFS_SB_FEAT_INCOMPAT_ZONED | \ + XFS_SB_FEAT_INCOMPAT_ZONE_GAPS) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool @@ -952,7 +959,12 @@ struct xfs_dinode { __be64 di_changecount; /* number of attribute changes */ __be64 di_lsn; /* flush sequence */ __be64 di_flags2; /* more random flags */ - __be32 di_cowextsize; /* basic cow extent size for file */ + union { + /* basic cow extent size for (regular) file */ + __be32 di_cowextsize; + /* used blocks in RTG for (zoned) rtrmap inode */ + __be32 di_used_blocks; + }; __u8 di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 2c3171262b44..12463ba766da 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -189,7 +189,9 @@ struct xfs_fsop_geom { uint32_t checked; /* o: checked fs & rt metadata */ __u32 rgextents; /* rt extents in a realtime group */ __u32 rgcount; /* number of realtime groups */ - __u64 reserved[16]; /* reserved space */ + __u64 rtstart; /* start of internal rt section */ + __u64 rtreserved; /* RT (zoned) reserved blocks */ + __u64 reserved[14]; /* reserved space */ }; #define XFS_FSOP_GEOM_SICK_COUNTERS (1 << 0) /* summary counters */ @@ -247,6 +249,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE (1 << 24) /* exchange range */ #define XFS_FSOP_GEOM_FLAGS_PARENT (1 << 25) /* linux parent pointers */ #define XFS_FSOP_GEOM_FLAGS_METADIR (1 << 26) /* metadata directories */ +#define XFS_FSOP_GEOM_FLAGS_ZONED (1 << 27) /* zoned rt device */ /* * Minimum and maximum sizes need for growth checks. 
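One detail worth pausing on in the xfs_format.h hunk above: di_cowextsize and di_used_blocks share a single on-disk slot, which is safe only because the two interpretations can never apply to the same inode. Readers must dispatch on the inode's role first, as the dinode verifier later in this section does; in sketch form:

	/* One on-disk __be32, two meanings, selected by inode role. */
	if (xfs_has_zoned(mp) &&
	    dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP))
		used = be32_to_cpu(dip->di_used_blocks);	/* rtrmap inode */
	else
		cowextsize = be32_to_cpu(dip->di_cowextsize);	/* regular file */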
@@ -1079,6 +1082,15 @@ struct xfs_rtgroup_geometry { #define XFS_IOC_COMMIT_RANGE _IOW ('X', 131, struct xfs_commit_range) /* XFS_IOC_GETFSUUID ---------- deprecated 140 */ +/* + * Devices supported by a single XFS file system. Reported in fsmaps fmr_device + * when using internal RT devices. + */ +enum xfs_device { + XFS_DEV_DATA = 1, + XFS_DEV_LOG = 2, + XFS_DEV_RT = 3, +}; #ifndef HAVE_BBMACROS /* diff --git a/fs/xfs/libxfs/xfs_group.h b/fs/xfs/libxfs/xfs_group.h index 242b05627c7a..4423932a2313 100644 --- a/fs/xfs/libxfs/xfs_group.h +++ b/fs/xfs/libxfs/xfs_group.h @@ -19,10 +19,23 @@ struct xfs_group { #ifdef __KERNEL__ /* -- kernel only structures below this line -- */ - /* - * Track freed but not yet committed extents. - */ - struct xfs_extent_busy_tree *xg_busy_extents; + union { + /* + * For perags and non-zoned RT groups: + * Track freed but not yet committed extents. + */ + struct xfs_extent_busy_tree *xg_busy_extents; + + /* + * For zoned RT groups: + * List of groups that need a zone reset. + * + * The zonegc code forces a log flush of the rtrmap inode before + * resetting the write pointer, so there is no need for + * individual busy extent tracking. + */ + struct xfs_group *xg_next_reset; + }; /* * Bitsets of per-ag metadata that have been checked and/or are sick. @@ -107,9 +120,15 @@ xfs_gbno_to_daddr( xfs_agblock_t gbno) { struct xfs_mount *mp = xg->xg_mount; - uint32_t blocks = mp->m_groups[xg->xg_type].blocks; + struct xfs_groups *g = &mp->m_groups[xg->xg_type]; + xfs_fsblock_t fsbno; + + if (g->has_daddr_gaps) + fsbno = xfs_gbno_to_fsb(xg, gbno); + else + fsbno = (xfs_fsblock_t)xg->xg_gno * g->blocks + gbno; - return XFS_FSB_TO_BB(mp, (xfs_fsblock_t)xg->xg_gno * blocks + gbno); + return XFS_FSB_TO_BB(mp, g->start_fsb + fsbno); } static inline uint32_t diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index f3a840a425f5..0c47b5c6ca7d 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -364,7 +364,7 @@ xfs_ialloc_inode_init( (j * M_IGEO(mp)->blocks_per_cluster)); error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize * M_IGEO(mp)->blocks_per_cluster, - XBF_UNMAPPED, &fbuf); + 0, &fbuf); if (error) return error; @@ -1927,7 +1927,7 @@ xfs_dialloc( * that we can immediately allocate, but then we allow allocation on the * second pass if we fail to find an AG with free inodes in it. 
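The has_daddr_gaps branch added to xfs_gbno_to_daddr() above is clearer with numbers:

	/* Illustrative values, not from the patch: each group occupies a
	 * power-of-two slot of 1 << 18 = 262144 fsblocks, of which only
	 * g->blocks = 254976 are usable. For group 3, gbno 100:
	 *   packed (no gaps): fsbno = 3 * 254976 + 100 = 765028
	 *   gapped:           fsbno = (3 << 18) | 100  = 786532
	 * and in both cases:  daddr = XFS_FSB_TO_BB(mp, g->start_fsb + fsbno)
	 * assuming xfs_gbno_to_fsb() uses the usual group-number-in-high-bits
	 * encoding.
	 */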
*/ - if (percpu_counter_read_positive(&mp->m_fdblocks) < + if (xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) < mp->m_low_space[XFS_LOWSP_1_PCNT]) { ok_alloc = false; low_space = true; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index f24fa628fecf..aa13fc00afd7 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -137,7 +137,7 @@ xfs_imap_to_bp( int error; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, - imap->im_len, XBF_UNMAPPED, bpp, &xfs_inode_buf_ops); + imap->im_len, 0, bpp, &xfs_inode_buf_ops); if (xfs_metadata_is_sick(error)) xfs_agno_mark_sick(mp, xfs_daddr_to_agno(mp, imap->im_blkno), XFS_SICK_AG_INODES); @@ -252,7 +252,10 @@ xfs_inode_from_disk( be64_to_cpu(from->di_changecount)); ip->i_crtime = xfs_inode_from_disk_ts(from, from->di_crtime); ip->i_diflags2 = be64_to_cpu(from->di_flags2); + /* also covers the di_used_blocks union arm: */ ip->i_cowextsize = be32_to_cpu(from->di_cowextsize); + BUILD_BUG_ON(sizeof(from->di_cowextsize) != + sizeof(from->di_used_blocks)); } error = xfs_iformat_data_fork(ip, from); @@ -349,6 +352,7 @@ xfs_inode_to_disk( to->di_changecount = cpu_to_be64(inode_peek_iversion(inode)); to->di_crtime = xfs_inode_to_disk_ts(ip, ip->i_crtime); to->di_flags2 = cpu_to_be64(ip->i_diflags2); + /* also covers the di_used_blocks union arm: */ to->di_cowextsize = cpu_to_be32(ip->i_cowextsize); to->di_ino = cpu_to_be64(ip->i_ino); to->di_lsn = cpu_to_be64(lsn); @@ -752,11 +756,18 @@ xfs_dinode_verify( !xfs_has_rtreflink(mp)) return __this_address; - /* COW extent size hint validation */ - fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), - mode, flags, flags2); - if (fa) - return fa; + if (xfs_has_zoned(mp) && + dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)) { + if (be32_to_cpu(dip->di_used_blocks) > mp->m_sb.sb_rgextents) + return __this_address; + } else { + /* COW extent size hint validation */ + fa = xfs_inode_validate_cowextsize(mp, + be32_to_cpu(dip->di_cowextsize), + mode, flags, flags2); + if (fa) + return fa; + } /* bigtime iflag can only happen on bigtime filesystems */ if (xfs_dinode_has_bigtime(dip) && diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c index deb0b7c00a1f..48fe49a5f050 100644 --- a/fs/xfs/libxfs/xfs_inode_util.c +++ b/fs/xfs/libxfs/xfs_inode_util.c @@ -322,6 +322,7 @@ xfs_inode_init( if (xfs_has_v3inodes(mp)) { inode_set_iversion(inode, 1); + /* also covers the di_used_blocks union arm: */ ip->i_cowextsize = 0; times |= XFS_ICHGTIME_CREATE; } diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index a472ac2e45d0..0d637c276db0 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -475,7 +475,12 @@ struct xfs_log_dinode { xfs_lsn_t di_lsn; uint64_t di_flags2; /* more random flags */ - uint32_t di_cowextsize; /* basic cow extent size for file */ + union { + /* basic cow extent size for (regular) file */ + uint32_t di_cowextsize; + /* used blocks in RTG for (zoned) rtrmap inode */ + uint32_t di_used_blocks; + }; uint8_t di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c index 2f5f554a36d4..225923e463c4 100644 --- a/fs/xfs/libxfs/xfs_metafile.c +++ b/fs/xfs/libxfs/xfs_metafile.c @@ -21,6 +21,9 @@ #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_alloc.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" +#include 
"xfs_rtrefcount_btree.h" static const struct { enum xfs_metafile_type mtype; @@ -74,12 +77,11 @@ xfs_metafile_clear_iflag( } /* - * Is the amount of space that could be allocated towards a given metadata - * file at or beneath a certain threshold? + * Is the metafile reservations at or beneath a certain threshold? */ static inline bool xfs_metafile_resv_can_cover( - struct xfs_inode *ip, + struct xfs_mount *mp, int64_t rhs) { /* @@ -88,43 +90,38 @@ xfs_metafile_resv_can_cover( * global free block count. Take care of the first case to avoid * touching the per-cpu counter. */ - if (ip->i_delayed_blks >= rhs) + if (mp->m_metafile_resv_avail >= rhs) return true; /* * There aren't enough blocks left in the inode's reservation, but it * isn't critical unless there also isn't enough free space. */ - return __percpu_counter_compare(&ip->i_mount->m_fdblocks, - rhs - ip->i_delayed_blks, 2048) >= 0; + return xfs_compare_freecounter(mp, XC_FREE_BLOCKS, + rhs - mp->m_metafile_resv_avail, 2048) >= 0; } /* - * Is this metadata file critically low on blocks? For now we'll define that - * as the number of blocks we can get our hands on being less than 10% of what - * we reserved or less than some arbitrary number (maximum btree height). + * Is the metafile reservation critically low on blocks? For now we'll define + * that as the number of blocks we can get our hands on being less than 10% of + * what we reserved or less than some arbitrary number (maximum btree height). */ bool xfs_metafile_resv_critical( - struct xfs_inode *ip) + struct xfs_mount *mp) { - uint64_t asked_low_water; + ASSERT(xfs_has_metadir(mp)); - if (!ip) - return false; - - ASSERT(xfs_is_metadir_inode(ip)); - trace_xfs_metafile_resv_critical(ip, 0); + trace_xfs_metafile_resv_critical(mp, 0); - if (!xfs_metafile_resv_can_cover(ip, ip->i_mount->m_rtbtree_maxlevels)) + if (!xfs_metafile_resv_can_cover(mp, mp->m_rtbtree_maxlevels)) return true; - asked_low_water = div_u64(ip->i_meta_resv_asked, 10); - if (!xfs_metafile_resv_can_cover(ip, asked_low_water)) + if (!xfs_metafile_resv_can_cover(mp, + div_u64(mp->m_metafile_resv_target, 10))) return true; - return XFS_TEST_ERROR(false, ip->i_mount, - XFS_ERRTAG_METAFILE_RESV_CRITICAL); + return XFS_TEST_ERROR(false, mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL); } /* Allocate a block from the metadata file's reservation. */ @@ -133,22 +130,24 @@ xfs_metafile_resv_alloc_space( struct xfs_inode *ip, struct xfs_alloc_arg *args) { + struct xfs_mount *mp = ip->i_mount; int64_t len = args->len; ASSERT(xfs_is_metadir_inode(ip)); ASSERT(args->resv == XFS_AG_RESV_METAFILE); - trace_xfs_metafile_resv_alloc_space(ip, args->len); + trace_xfs_metafile_resv_alloc_space(mp, args->len); /* * Allocate the blocks from the metadata inode's block reservation * and update the ondisk sb counter. 
*/ - if (ip->i_delayed_blks > 0) { + mutex_lock(&mp->m_metafile_resv_lock); + if (mp->m_metafile_resv_avail > 0) { int64_t from_resv; - from_resv = min_t(int64_t, len, ip->i_delayed_blks); - ip->i_delayed_blks -= from_resv; + from_resv = min_t(int64_t, len, mp->m_metafile_resv_avail); + mp->m_metafile_resv_avail -= from_resv; xfs_mod_delalloc(ip, 0, -from_resv); xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -from_resv); @@ -175,6 +174,9 @@ xfs_metafile_resv_alloc_space( xfs_trans_mod_sb(args->tp, field, -len); } + mp->m_metafile_resv_used += args->len; + mutex_unlock(&mp->m_metafile_resv_lock); + ip->i_nblocks += args->len; xfs_trans_log_inode(args->tp, ip, XFS_ILOG_CORE); } @@ -186,26 +188,33 @@ xfs_metafile_resv_free_space( struct xfs_trans *tp, xfs_filblks_t len) { + struct xfs_mount *mp = ip->i_mount; int64_t to_resv; ASSERT(xfs_is_metadir_inode(ip)); - trace_xfs_metafile_resv_free_space(ip, len); + + trace_xfs_metafile_resv_free_space(mp, len); ip->i_nblocks -= len; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + mutex_lock(&mp->m_metafile_resv_lock); + mp->m_metafile_resv_used -= len; + /* * Add the freed blocks back into the inode's delalloc reservation * until it reaches the maximum size. Update the ondisk fdblocks only. */ - to_resv = ip->i_meta_resv_asked - (ip->i_nblocks + ip->i_delayed_blks); + to_resv = mp->m_metafile_resv_target - + (mp->m_metafile_resv_used + mp->m_metafile_resv_avail); if (to_resv > 0) { to_resv = min_t(int64_t, to_resv, len); - ip->i_delayed_blks += to_resv; + mp->m_metafile_resv_avail += to_resv; xfs_mod_delalloc(ip, 0, to_resv); xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, to_resv); len -= to_resv; } + mutex_unlock(&mp->m_metafile_resv_lock); /* * Everything else goes back to the filesystem, so update the in-core @@ -215,61 +224,99 @@ xfs_metafile_resv_free_space( xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len); } -/* Release a metadata file's space reservation. */ +static void +__xfs_metafile_resv_free( + struct xfs_mount *mp) +{ + if (mp->m_metafile_resv_avail) { + xfs_mod_sb_delalloc(mp, -(int64_t)mp->m_metafile_resv_avail); + xfs_add_fdblocks(mp, mp->m_metafile_resv_avail); + } + mp->m_metafile_resv_avail = 0; + mp->m_metafile_resv_used = 0; + mp->m_metafile_resv_target = 0; +} + +/* Release unused metafile space reservation. */ void xfs_metafile_resv_free( - struct xfs_inode *ip) + struct xfs_mount *mp) { - /* Non-btree metadata inodes don't need space reservations. */ - if (!ip || !ip->i_meta_resv_asked) + if (!xfs_has_metadir(mp)) return; - ASSERT(xfs_is_metadir_inode(ip)); - trace_xfs_metafile_resv_free(ip, 0); + trace_xfs_metafile_resv_free(mp, 0); - if (ip->i_delayed_blks) { - xfs_mod_delalloc(ip, 0, -ip->i_delayed_blks); - xfs_add_fdblocks(ip->i_mount, ip->i_delayed_blks); - ip->i_delayed_blks = 0; - } - ip->i_meta_resv_asked = 0; + mutex_lock(&mp->m_metafile_resv_lock); + __xfs_metafile_resv_free(mp); + mutex_unlock(&mp->m_metafile_resv_lock); } -/* Set up a metadata file's space reservation. */ +/* Set up a metafile space reservation. */ int xfs_metafile_resv_init( - struct xfs_inode *ip, - xfs_filblks_t ask) + struct xfs_mount *mp) { + struct xfs_rtgroup *rtg = NULL; + xfs_filblks_t used = 0, target = 0; xfs_filblks_t hidden_space; - xfs_filblks_t used; - int error; + xfs_rfsblock_t dblocks_avail = mp->m_sb.sb_dblocks / 4; + int error = 0; - if (!ip || ip->i_meta_resv_asked > 0) + if (!xfs_has_metadir(mp)) return 0; - ASSERT(xfs_is_metadir_inode(ip)); + /* + * Free any previous reservation to have a clean slate. 
+ */ + mutex_lock(&mp->m_metafile_resv_lock); + __xfs_metafile_resv_free(mp); + + /* + * Currently the only btree metafiles that require reservations are the + * rtrmap and the rtrefcount. Anything new will have to be added here + * as well. + */ + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + if (xfs_has_rtrmapbt(mp)) { + used += rtg_rmap(rtg)->i_nblocks; + target += xfs_rtrmapbt_calc_reserves(mp); + } + if (xfs_has_rtreflink(mp)) { + used += rtg_refcount(rtg)->i_nblocks; + target += xfs_rtrefcountbt_calc_reserves(mp); + } + } + + if (!target) + goto out_unlock; /* - * Space taken by all other metadata btrees are accounted on-disk as + * Space taken by the per-AG metadata btrees are accounted on-disk as * used space. We therefore only hide the space that is reserved but * not used by the trees. */ - used = ip->i_nblocks; - if (used > ask) - ask = used; - hidden_space = ask - used; + if (used > target) + target = used; + else if (target > dblocks_avail) + target = dblocks_avail; + hidden_space = target - used; - error = xfs_dec_fdblocks(ip->i_mount, hidden_space, true); + error = xfs_dec_fdblocks(mp, hidden_space, true); if (error) { - trace_xfs_metafile_resv_init_error(ip, error, _RET_IP_); - return error; + trace_xfs_metafile_resv_init_error(mp, 0); + goto out_unlock; } - xfs_mod_delalloc(ip, 0, hidden_space); - ip->i_delayed_blks = hidden_space; - ip->i_meta_resv_asked = ask; + xfs_mod_sb_delalloc(mp, hidden_space); + + mp->m_metafile_resv_target = target; + mp->m_metafile_resv_used = used; + mp->m_metafile_resv_avail = hidden_space; + + trace_xfs_metafile_resv_init(mp, target); - trace_xfs_metafile_resv_init(ip, ask); - return 0; +out_unlock: + mutex_unlock(&mp->m_metafile_resv_lock); + return error; } diff --git a/fs/xfs/libxfs/xfs_metafile.h b/fs/xfs/libxfs/xfs_metafile.h index 95af4b52e5a7..ae6f9e779b98 100644 --- a/fs/xfs/libxfs/xfs_metafile.h +++ b/fs/xfs/libxfs/xfs_metafile.h @@ -26,13 +26,13 @@ void xfs_metafile_clear_iflag(struct xfs_trans *tp, struct xfs_inode *ip); /* Space reservations for metadata inodes. */ struct xfs_alloc_arg; -bool xfs_metafile_resv_critical(struct xfs_inode *ip); +bool xfs_metafile_resv_critical(struct xfs_mount *mp); void xfs_metafile_resv_alloc_space(struct xfs_inode *ip, struct xfs_alloc_arg *args); void xfs_metafile_resv_free_space(struct xfs_inode *ip, struct xfs_trans *tp, xfs_filblks_t len); -void xfs_metafile_resv_free(struct xfs_inode *ip); -int xfs_metafile_resv_init(struct xfs_inode *ip, xfs_filblks_t ask); +void xfs_metafile_resv_free(struct xfs_mount *mp); +int xfs_metafile_resv_init(struct xfs_mount *mp); /* Code specific to kernel/userspace; must be provided externally. 
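The struct-size assertions updated in the xfs_ondisk.h hunk just below follow directly from the superblock fields added earlier in this section:

	/* Two __be64 fields appended to the v5 superblock:
	 *   old sizeof(struct xfs_dsb) = 288
	 *   sb_rtstart    at offset      288
	 *   sb_rtreserved at offset      296
	 *   new sizeof = 288 + 2 * 8   = 304
	 */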
*/ diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index a85ecddaa48e..5ed44fdf7491 100644 --- a/fs/xfs/libxfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -233,8 +233,8 @@ xfs_check_ondisk_structs(void) 16299260424LL); /* superblock field checks we got from xfs/122 */ - XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 288); - XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 288); + XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 304); + XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 304); XFS_CHECK_SB_OFFSET(sb_magicnum, 0); XFS_CHECK_SB_OFFSET(sb_blocksize, 4); XFS_CHECK_SB_OFFSET(sb_dblocks, 8); @@ -295,6 +295,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_SB_OFFSET(sb_rgextents, 276); XFS_CHECK_SB_OFFSET(sb_rgblklog, 280); XFS_CHECK_SB_OFFSET(sb_pad, 281); + XFS_CHECK_SB_OFFSET(sb_rtstart, 288); + XFS_CHECK_SB_OFFSET(sb_rtreserved, 296); } #endif /* __XFS_ONDISK_H */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 770adf60dd73..5057536e586c 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1123,6 +1123,7 @@ xfs_rtfree_blocks( xfs_extlen_t mod; int error; + ASSERT(!xfs_has_zoned(mp)); ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN); mod = xfs_blen_to_rtxoff(mp, rtlen); @@ -1174,6 +1175,9 @@ xfs_rtalloc_query_range( end = min(end, rtg->rtg_extents - 1); + if (xfs_has_zoned(mp)) + return -EINVAL; + /* Iterate the bitmap, looking for discrepancies. */ while (start <= end) { struct xfs_rtalloc_rec rec; @@ -1268,6 +1272,8 @@ xfs_rtbitmap_blockcount_len( struct xfs_mount *mp, xfs_rtbxlen_t rtextents) { + if (xfs_has_zoned(mp)) + return 0; return howmany_64(rtextents, xfs_rtbitmap_rtx_per_rbmblock(mp)); } @@ -1308,6 +1314,11 @@ xfs_rtsummary_blockcount( xfs_rtbxlen_t rextents = xfs_rtbitmap_bitcount(mp); unsigned long long rsumwords; + if (xfs_has_zoned(mp)) { + *rsumlevels = 0; + return 0; + } + *rsumlevels = xfs_compute_rextslog(rextents) + 1; rsumwords = xfs_rtbitmap_blockcount_len(mp, rextents) * (*rsumlevels); return howmany_64(rsumwords, mp->m_blockwsize); diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c index d84d32f1b48f..9186c58e83d5 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.c +++ b/fs/xfs/libxfs/xfs_rtgroup.c @@ -194,15 +194,17 @@ xfs_rtgroup_lock( ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) || !(rtglock_flags & XFS_RTGLOCK_BITMAP)); - if (rtglock_flags & XFS_RTGLOCK_BITMAP) { - /* - * Lock both realtime free space metadata inodes for a freespace - * update. - */ - xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); - xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL); - } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { - xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); + if (!xfs_has_zoned(rtg_mount(rtg))) { + if (rtglock_flags & XFS_RTGLOCK_BITMAP) { + /* + * Lock both realtime free space metadata inodes for a + * freespace update. 
+ */ + xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); + xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL); + } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { + xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); + } } if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) @@ -228,11 +230,13 @@ xfs_rtgroup_unlock( if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_EXCL); - if (rtglock_flags & XFS_RTGLOCK_BITMAP) { - xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL); - xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); - } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { - xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); + if (!xfs_has_zoned(rtg_mount(rtg))) { + if (rtglock_flags & XFS_RTGLOCK_BITMAP) { + xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL); + xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); + } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { + xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); + } } } @@ -249,7 +253,8 @@ xfs_rtgroup_trans_join( ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS)); ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED)); - if (rtglock_flags & XFS_RTGLOCK_BITMAP) { + if (!xfs_has_zoned(rtg_mount(rtg)) && + (rtglock_flags & XFS_RTGLOCK_BITMAP)) { xfs_trans_ijoin(tp, rtg_bitmap(rtg), XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, rtg_summary(rtg), XFS_ILOCK_EXCL); } @@ -270,7 +275,7 @@ xfs_rtgroup_get_geometry( /* Fill out form. */ memset(rgeo, 0, sizeof(*rgeo)); rgeo->rg_number = rtg_rgno(rtg); - rgeo->rg_length = rtg_group(rtg)->xg_block_count; + rgeo->rg_length = rtg_blocks(rtg); xfs_rtgroup_geom_health(rtg, rgeo); return 0; } @@ -354,6 +359,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = { .sick = XFS_SICK_RG_BITMAP, .fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) | (1U << XFS_DINODE_FMT_BTREE), + .enabled = xfs_has_nonzoned, .create = xfs_rtbitmap_create, }, [XFS_RTGI_SUMMARY] = { @@ -362,6 +368,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = { .sick = XFS_SICK_RG_SUMMARY, .fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) | (1U << XFS_DINODE_FMT_BTREE), + .enabled = xfs_has_nonzoned, .create = xfs_rtsummary_create, }, [XFS_RTGI_RMAP] = { diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h index 03f39d4e43fc..d36a6ae0abe5 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.h +++ b/fs/xfs/libxfs/xfs_rtgroup.h @@ -37,15 +37,33 @@ struct xfs_rtgroup { xfs_rtxnum_t rtg_extents; /* - * Cache of rt summary level per bitmap block with the invariant that - * rtg_rsum_cache[bbno] > the maximum i for which rsum[i][bbno] != 0, - * or 0 if rsum[i][bbno] == 0 for all i. - * + * For bitmap based RT devices this points to a cache of rt summary + * level per bitmap block with the invariant that rtg_rsum_cache[bbno] + * > the maximum i for which rsum[i][bbno] != 0, or 0 if + * rsum[i][bbno] == 0 for all i. * Reads and writes are serialized by the rsumip inode lock. + * + * For zoned RT devices this points to the open zone structure for + * a group that is open for writers, or is NULL. */ - uint8_t *rtg_rsum_cache; + union { + uint8_t *rtg_rsum_cache; + struct xfs_open_zone *rtg_open_zone; + }; }; +/* + * For zoned RT devices this is set on groups that have no written blocks + * and can be picked by the allocator for opening. + */ +#define XFS_RTG_FREE XA_MARK_0 + +/* + * For zoned RT devices this is set on groups that are fully written and that + * have unused blocks. Used by the garbage collection to pick targets. 
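XA_MARK_0 here, together with XA_MARK_1 defined just below as XFS_RTG_RECLAIMABLE, gives the allocator and the garbage collector filtered iteration over the groups without maintaining separate lists. Assuming the rtgroups live in the per-type group xarray (mp->m_groups[XG_TYPE_RTG].xa, an assumption based on the generic group code rather than on these hunks), a GC candidate scan could look like:

	unsigned long index;
	struct xfs_group *xg;

	xa_for_each_marked(&mp->m_groups[XG_TYPE_RTG].xa, index, xg,
			   XFS_RTG_RECLAIMABLE) {
		/* fully written groups that still contain unused blocks */
		consider_for_gc(to_rtg(xg));	/* hypothetical helper */
	}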
+ */ +#define XFS_RTG_RECLAIMABLE XA_MARK_1 + static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg) { return container_of(xg, struct xfs_rtgroup, rtg_group); @@ -66,6 +84,11 @@ static inline xfs_rgnumber_t rtg_rgno(const struct xfs_rtgroup *rtg) return rtg->rtg_group.xg_gno; } +static inline xfs_rgblock_t rtg_blocks(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_group.xg_block_count; +} + static inline struct xfs_inode *rtg_bitmap(const struct xfs_rtgroup *rtg) { return rtg->rtg_inodes[XFS_RTGI_BITMAP]; @@ -222,10 +245,14 @@ xfs_rtb_to_daddr( xfs_rtblock_t rtbno) { struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; - xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno); - uint64_t start_bno = (xfs_rtblock_t)rgno * g->blocks; - return XFS_FSB_TO_BB(mp, start_bno + (rtbno & g->blkmask)); + if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) { + xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno); + + rtbno = (xfs_rtblock_t)rgno * g->blocks + (rtbno & g->blkmask); + } + + return XFS_FSB_TO_BB(mp, g->start_fsb + rtbno); } static inline xfs_rtblock_t @@ -233,10 +260,11 @@ xfs_daddr_to_rtb( struct xfs_mount *mp, xfs_daddr_t daddr) { - xfs_rfsblock_t bno = XFS_BB_TO_FSBT(mp, daddr); + struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; + xfs_rfsblock_t bno; - if (xfs_has_rtgroups(mp)) { - struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; + bno = XFS_BB_TO_FSBT(mp, daddr) - g->start_fsb; + if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) { xfs_rgnumber_t rgno; uint32_t rgbno; diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c index e4ec36943cb7..9bdc2cbfc113 100644 --- a/fs/xfs/libxfs/xfs_rtrmap_btree.c +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c @@ -1033,3 +1033,22 @@ xfs_rtrmapbt_init_rtsb( xfs_btree_del_cursor(cur, error); return error; } + +/* + * Return the highest rgbno currently tracked by the rmap for this rtg. 
+ */ +xfs_rgblock_t +xfs_rtrmap_highest_rgbno( + struct xfs_rtgroup *rtg) +{ + struct xfs_btree_block *block = rtg_rmap(rtg)->i_df.if_broot; + union xfs_btree_key key = {}; + struct xfs_btree_cur *cur; + + if (block->bb_numrecs == 0) + return NULLRGBLOCK; + cur = xfs_rtrmapbt_init_cursor(NULL, rtg); + xfs_btree_get_keys(cur, block, &key); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return be32_to_cpu(key.__rmap_bigkey[1].rm_startblock); +} diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.h b/fs/xfs/libxfs/xfs_rtrmap_btree.h index 9d0915089891..e328fd62a149 100644 --- a/fs/xfs/libxfs/xfs_rtrmap_btree.h +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.h @@ -207,4 +207,6 @@ struct xfs_btree_cur *xfs_rtrmapbt_mem_cursor(struct xfs_rtgroup *rtg, int xfs_rtrmapbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree, struct xfs_buftarg *btp, xfs_rgnumber_t rgno); +xfs_rgblock_t xfs_rtrmap_highest_rgbno(struct xfs_rtgroup *rtg); + #endif /* __XFS_RTRMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 3dc5f5dba162..711e180f9ebb 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -185,6 +185,8 @@ xfs_sb_version_to_features( features |= XFS_FEAT_PARENT; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) features |= XFS_FEAT_METADIR; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) + features |= XFS_FEAT_ZONED; return features; } @@ -266,6 +268,9 @@ static uint64_t xfs_expected_rbmblocks( struct xfs_sb *sbp) { + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) + return 0; return howmany_64(xfs_extents_per_rbm(sbp), NBBY * xfs_rtbmblock_size(sbp)); } @@ -275,9 +280,15 @@ bool xfs_validate_rt_geometry( struct xfs_sb *sbp) { - if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || - sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) - return false; + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) { + if (sbp->sb_rextsize != 1) + return false; + } else { + if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || + sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) + return false; + } if (sbp->sb_rblocks == 0) { if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 || @@ -435,6 +446,34 @@ xfs_validate_sb_rtgroups( return 0; } +static int +xfs_validate_sb_zoned( + struct xfs_mount *mp, + struct xfs_sb *sbp) +{ + if (sbp->sb_frextents != 0) { + xfs_warn(mp, +"sb_frextents must be zero for zoned file systems."); + return -EINVAL; + } + + if (sbp->sb_rtstart && sbp->sb_rtstart < sbp->sb_dblocks) { + xfs_warn(mp, +"sb_rtstart (%lld) overlaps sb_dblocks (%lld).", + sbp->sb_rtstart, sbp->sb_dblocks); + return -EINVAL; + } + + if (sbp->sb_rtreserved && sbp->sb_rtreserved >= sbp->sb_rblocks) { + xfs_warn(mp, +"sb_rtreserved (%lld) larger than sb_rblocks (%lld).", + sbp->sb_rtreserved, sbp->sb_rblocks); + return -EINVAL; + } + + return 0; +} + /* Check the validity of the SB. 
*/ STATIC int xfs_validate_sb_common( @@ -523,6 +562,11 @@ xfs_validate_sb_common( if (error) return error; } + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { + error = xfs_validate_sb_zoned(mp, sbp); + if (error) + return error; + } } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) { xfs_notice(mp, @@ -835,6 +879,14 @@ __xfs_sb_from_disk( to->sb_rgcount = 1; to->sb_rgextents = 0; } + + if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { + to->sb_rtstart = be64_to_cpu(from->sb_rtstart); + to->sb_rtreserved = be64_to_cpu(from->sb_rtreserved); + } else { + to->sb_rtstart = 0; + to->sb_rtreserved = 0; + } } void @@ -1001,6 +1053,11 @@ xfs_sb_to_disk( to->sb_rbmino = cpu_to_be64(0); to->sb_rsumino = cpu_to_be64(0); } + + if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { + to->sb_rtstart = cpu_to_be64(from->sb_rtstart); + to->sb_rtreserved = cpu_to_be64(from->sb_rtreserved); + } } /* @@ -1146,6 +1203,10 @@ xfs_sb_mount_rextsize( rgs->blocks = sbp->sb_rgextents * sbp->sb_rextsize; rgs->blklog = mp->m_sb.sb_rgblklog; rgs->blkmask = xfs_mask32lo(mp->m_sb.sb_rgblklog); + rgs->start_fsb = mp->m_sb.sb_rtstart; + if (xfs_sb_has_incompat_feature(sbp, + XFS_SB_FEAT_INCOMPAT_ZONE_GAPS)) + rgs->has_daddr_gaps = true; } else { rgs->blocks = 0; rgs->blklog = 0; @@ -1265,8 +1326,7 @@ xfs_log_sb( mp->m_sb.sb_ifree = min_t(uint64_t, percpu_counter_sum_positive(&mp->m_ifree), mp->m_sb.sb_icount); - mp->m_sb.sb_fdblocks = - percpu_counter_sum_positive(&mp->m_fdblocks); + mp->m_sb.sb_fdblocks = xfs_sum_freecounter(mp, XC_FREE_BLOCKS); } /* @@ -1275,9 +1335,10 @@ xfs_log_sb( * we handle nearly-lockless reservations, so we must use the _positive * variant here to avoid writing out nonsense frextents. */ - if (xfs_has_rtgroups(mp)) + if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp)) { mp->m_sb.sb_frextents = - percpu_counter_sum_positive(&mp->m_frextents); + xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS); + } xfs_sb_to_disk(bp->b_addr, &mp->m_sb); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); @@ -1510,6 +1571,8 @@ xfs_fs_geometry( geo->flags |= XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE; if (xfs_has_metadir(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_METADIR; + if (xfs_has_zoned(mp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_ZONED; geo->rtsectsize = sbp->sb_blocksize; geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp); @@ -1530,6 +1593,10 @@ xfs_fs_geometry( geo->rgcount = sbp->sb_rgcount; geo->rgextents = sbp->sb_rgextents; } + if (xfs_has_zoned(mp)) { + geo->rtstart = sbp->sb_rtstart; + geo->rtreserved = sbp->sb_rtreserved; + } } /* Read a secondary superblock. */ diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index ca2401c1facd..f6f4f2d4b5db 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -233,6 +233,34 @@ enum xfs_group_type { { XG_TYPE_AG, "ag" }, \ { XG_TYPE_RTG, "rtg" } +enum xfs_free_counter { + /* + * Number of free blocks on the data device. + */ + XC_FREE_BLOCKS, + + /* + * Number of free RT extents on the RT device. + */ + XC_FREE_RTEXTENTS, + + /* + * Number of available for use RT extents. + * + * This counter only exists for zoned RT device and indicates the number + * of RT extents that can be directly used by writes. XC_FREE_RTEXTENTS + * also includes blocks that have been written previously and freed, but + * sit in a rtgroup that still needs a zone reset. 
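
[Editor's aside; illustration only, not part of the patch.]  A minimal model of the relationship between the two RT counters described above: XC_FREE_RTAVAILABLE (declared just below) never exceeds XC_FREE_RTEXTENTS, and the difference is exactly the freed-but-not-yet-reset space that a zone reset hands back.  All names in this sketch are invented:

	struct rt_counters {
		long long free;		/* XC_FREE_RTEXTENTS */
		long long available;	/* XC_FREE_RTAVAILABLE */
	};

	/* allocation consumes directly writable space: both counters drop */
	static void rt_alloc(struct rt_counters *c, long long n)
	{
		c->free -= n;
		c->available -= n;
	}

	/* freeing returns space to "free" only... */
	static void rt_free(struct rt_counters *c, long long n)
	{
		c->free += n;
	}

	/* ...a zone reset makes it directly writable again */
	static void rt_zone_reset(struct rt_counters *c, long long n)
	{
		c->available += n;
	}

	/* what garbage collection / zone resets can still recover */
	static long long rt_pending(const struct rt_counters *c)
	{
		return c->free - c->available;
	}
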
+ */ + XC_FREE_RTAVAILABLE, + XC_FREE_NR, +}; + +#define XFS_FREECOUNTER_STR \ + { XC_FREE_BLOCKS, "blocks" }, \ + { XC_FREE_RTEXTENTS, "rtextents" }, \ + { XC_FREE_RTAVAILABLE, "rtavailable" } + /* * Type verifier functions */ diff --git a/fs/xfs/libxfs/xfs_zones.c b/fs/xfs/libxfs/xfs_zones.c new file mode 100644 index 000000000000..b0791a71931c --- /dev/null +++ b/fs/xfs/libxfs/xfs_zones.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_rtgroup.h" +#include "xfs_zones.h" + +static bool +xfs_zone_validate_empty( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + if (rtg_rmap(rtg)->i_used_blocks > 0) { + xfs_warn(mp, "empty zone %u has non-zero used counter (0x%x).", + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks); + return false; + } + + *write_pointer = 0; + return true; +} + +static bool +xfs_zone_validate_wp( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + xfs_rtblock_t wp_fsb = xfs_daddr_to_rtb(mp, zone->wp); + + if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) { + xfs_warn(mp, "zone %u has too large used counter (0x%x).", + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks); + return false; + } + + if (xfs_rtb_to_rgno(mp, wp_fsb) != rtg_rgno(rtg)) { + xfs_warn(mp, "zone %u write pointer (0x%llx) outside of zone.", + rtg_rgno(rtg), wp_fsb); + return false; + } + + *write_pointer = xfs_rtb_to_rgbno(mp, wp_fsb); + if (*write_pointer >= rtg->rtg_extents) { + xfs_warn(mp, "zone %u has invalid write pointer (0x%x).", + rtg_rgno(rtg), *write_pointer); + return false; + } + + return true; +} + +static bool +xfs_zone_validate_full( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) { + xfs_warn(mp, "zone %u has too large used counter (0x%x).", + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks); + return false; + } + + *write_pointer = rtg->rtg_extents; + return true; +} + +static bool +xfs_zone_validate_seq( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + switch (zone->cond) { + case BLK_ZONE_COND_EMPTY: + return xfs_zone_validate_empty(zone, rtg, write_pointer); + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + return xfs_zone_validate_wp(zone, rtg, write_pointer); + case BLK_ZONE_COND_FULL: + return xfs_zone_validate_full(zone, rtg, write_pointer); + case BLK_ZONE_COND_NOT_WP: + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + xfs_warn(mp, "zone %u has unsupported zone condition 0x%x.", + rtg_rgno(rtg), zone->cond); + return false; + default: + xfs_warn(mp, "zone %u has unknown zone condition 0x%x.", + rtg_rgno(rtg), zone->cond); + return false; + } +} + +static bool +xfs_zone_validate_conv( + struct blk_zone *zone, + struct xfs_rtgroup *rtg) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + switch (zone->cond) { + case BLK_ZONE_COND_NOT_WP: + return true; + default: + xfs_warn(mp, +"conventional zone %u has unsupported zone 
condition 0x%x.", + rtg_rgno(rtg), zone->cond); + return false; + } +} + +bool +xfs_zone_validate( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; + uint32_t expected_size; + + /* + * Check that the zone capacity matches the rtgroup size stored in the + * superblock. Note that all zones including the last one must have a + * uniform capacity. + */ + if (XFS_BB_TO_FSB(mp, zone->capacity) != g->blocks) { + xfs_warn(mp, +"zone %u capacity (0x%llx) does not match RT group size (0x%x).", + rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->capacity), + g->blocks); + return false; + } + + if (g->has_daddr_gaps) { + expected_size = 1 << g->blklog; + } else { + if (zone->len != zone->capacity) { + xfs_warn(mp, +"zone %u has capacity != size ((0x%llx vs 0x%llx)", + rtg_rgno(rtg), + XFS_BB_TO_FSB(mp, zone->len), + XFS_BB_TO_FSB(mp, zone->capacity)); + return false; + } + expected_size = g->blocks; + } + + if (XFS_BB_TO_FSB(mp, zone->len) != expected_size) { + xfs_warn(mp, +"zone %u length (0x%llx) does match geometry (0x%x).", + rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->len), + expected_size); + } + + switch (zone->type) { + case BLK_ZONE_TYPE_CONVENTIONAL: + return xfs_zone_validate_conv(zone, rtg); + case BLK_ZONE_TYPE_SEQWRITE_REQ: + return xfs_zone_validate_seq(zone, rtg, write_pointer); + default: + xfs_warn(mp, "zoned %u has unsupported type 0x%x.", + rtg_rgno(rtg), zone->type); + return false; + } +} diff --git a/fs/xfs/libxfs/xfs_zones.h b/fs/xfs/libxfs/xfs_zones.h new file mode 100644 index 000000000000..c4f1367b2cca --- /dev/null +++ b/fs/xfs/libxfs/xfs_zones.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LIBXFS_ZONES_H +#define _LIBXFS_ZONES_H + +struct xfs_rtgroup; + +/* + * In order to guarantee forward progress for GC we need to reserve at least + * two zones: one that will be used for moving data into and one spare zone + * making sure that we have enough space to relocate a nearly-full zone. + * To allow for slightly sloppy accounting for when we need to reserve the + * second zone, we actually reserve three as that is easier than doing fully + * accurate bookkeeping. + */ +#define XFS_GC_ZONES 3U + +/* + * In addition we need two zones for user writes, one open zone for writing + * and one to still have available blocks without resetting the open zone + * when data in the open zone has been freed. + */ +#define XFS_RESERVED_ZONES (XFS_GC_ZONES + 1) +#define XFS_MIN_ZONES (XFS_RESERVED_ZONES + 1) + +/* + * Always keep one zone out of the general open zone pool to allow for GC to + * happen while other writers are waiting for free space. 
+ */
+#define XFS_OPEN_GC_ZONES	1U
+#define XFS_MIN_OPEN_ZONES	(XFS_OPEN_GC_ZONES + 1U)
+
+bool xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg,
+	xfs_rgblock_t *write_pointer);
+
+#endif /* _LIBXFS_ZONES_H */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 9f8c312dfd3c..303374df44bd 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -69,6 +69,8 @@ STATIC size_t
 xchk_superblock_ondisk_size(
 	struct xfs_mount	*mp)
 {
+	if (xfs_has_zoned(mp))
+		return offsetofend(struct xfs_dsb, sb_rtreserved);
 	if (xfs_has_metadir(mp))
 		return offsetofend(struct xfs_dsb, sb_pad);
 	if (xfs_has_metauuid(mp))
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 66da7d4d56ba..4f1e2574660d 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -1038,8 +1038,8 @@ xchk_bmap(
 
 	switch (whichfork) {
 	case XFS_COW_FORK:
-		/* No CoW forks on non-reflink filesystems. */
-		if (!xfs_has_reflink(mp)) {
+		/* No CoW forks on filesystems that can't write out of place. */
+		if (!xfs_has_reflink(mp) && !xfs_has_zoned(mp)) {
 			xchk_ino_set_corrupt(sc, sc->ip->i_ino);
 			return 0;
 		}
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index ca23cf4db6c5..e629663e460a 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -350,7 +350,7 @@ retry:
 	 * The global incore space reservation is taken from the incore
 	 * counters, so leave that out of the computation.
 	 */
-	fsc->fdblocks -= mp->m_resblks_avail;
+	fsc->fdblocks -= mp->m_free[XC_FREE_BLOCKS].res_avail;
 
 	/*
 	 * Delayed allocation reservations are taken out of the incore counters
@@ -413,7 +413,13 @@ xchk_fscount_count_frextents(
 
 	fsc->frextents = 0;
 	fsc->frextents_delayed = 0;
-	if (!xfs_has_realtime(mp))
+
+	/*
+	 * Don't bother verifying and repairing the fs counters for zoned file
+	 * systems as they don't track an on-disk frextents count, and the
+	 * in-memory percpu counter also includes reservations.
+	 */
+	if (!xfs_has_realtime(mp) || xfs_has_zoned(mp))
 		return 0;
 
 	while ((rtg = xfs_rtgroup_next(mp, rtg))) {
@@ -513,8 +519,8 @@ xchk_fscounters(
 	/* Snapshot the percpu counters. */
 	icount = percpu_counter_sum(&mp->m_icount);
 	ifree = percpu_counter_sum(&mp->m_ifree);
-	fdblocks = percpu_counter_sum(&mp->m_fdblocks);
-	frextents = percpu_counter_sum(&mp->m_frextents);
+	fdblocks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS);
+	frextents = xfs_sum_freecounter_raw(mp, XC_FREE_RTEXTENTS);
 
 	/* No negative values, please!
*/ if (icount < 0 || ifree < 0) @@ -589,15 +595,17 @@ xchk_fscounters( try_again = true; } - if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks, - fsc->fdblocks)) { + if (!xchk_fscount_within_range(sc, fdblocks, + &mp->m_free[XC_FREE_BLOCKS].count, fsc->fdblocks)) { if (fsc->frozen) xchk_set_corrupt(sc); else try_again = true; } - if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents, + if (!xfs_has_zoned(mp) && + !xchk_fscount_within_range(sc, frextents, + &mp->m_free[XC_FREE_RTEXTENTS].count, fsc->frextents - fsc->frextents_delayed)) { if (fsc->frozen) xchk_set_corrupt(sc); diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c index cda13447a373..f0d2b04644e4 100644 --- a/fs/xfs/scrub/fscounters_repair.c +++ b/fs/xfs/scrub/fscounters_repair.c @@ -64,7 +64,7 @@ xrep_fscounters( percpu_counter_set(&mp->m_icount, fsc->icount); percpu_counter_set(&mp->m_ifree, fsc->ifree); - percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks); + xfs_set_freecounter(mp, XC_FREE_BLOCKS, fsc->fdblocks); /* * Online repair is only supported on v5 file systems, which require @@ -74,10 +74,12 @@ xrep_fscounters( * track of the delalloc reservations separately, as they are are * subtracted from m_frextents, but not included in sb_frextents. */ - percpu_counter_set(&mp->m_frextents, - fsc->frextents - fsc->frextents_delayed); - if (!xfs_has_rtgroups(mp)) - mp->m_sb.sb_frextents = fsc->frextents; + if (!xfs_has_zoned(mp)) { + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, + fsc->frextents - fsc->frextents_delayed); + if (!xfs_has_rtgroups(mp)) + mp->m_sb.sb_frextents = fsc->frextents; + } return 0; } diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index db6edd5a5fe5..bb3f475b6353 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -273,6 +273,13 @@ xchk_inode_cowextsize( xfs_failaddr_t fa; uint32_t value = be32_to_cpu(dip->di_cowextsize); + /* + * The used block counter for rtrmap is checked and repaired elsewhere. + */ + if (xfs_has_zoned(sc->mp) && + dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)) + return; + fa = xfs_inode_validate_cowextsize(sc->mp, value, mode, flags, flags2); if (fa) xchk_ino_set_corrupt(sc, ino); diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index 13ff1c933cb8..a90a011c7e5f 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -710,7 +710,9 @@ xrep_dinode_extsize_hints( XFS_DIFLAG_EXTSZINHERIT); } - if (dip->di_version < 3) + if (dip->di_version < 3 || + (xfs_has_zoned(sc->mp) && + dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP))) return; fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), @@ -1558,8 +1560,7 @@ xrep_dinode_core( /* Read the inode cluster buffer. 
*/ error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, - ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp, - NULL); + ri->imap.im_blkno, ri->imap.im_len, 0, &bp, NULL); if (error) return error; diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c index ac38f5843090..1588ce971cb8 100644 --- a/fs/xfs/scrub/newbt.c +++ b/fs/xfs/scrub/newbt.c @@ -62,7 +62,7 @@ xrep_newbt_estimate_slack( free = sc->sa.pag->pagf_freeblks; sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag)); } else { - free = percpu_counter_sum(&sc->mp->m_fdblocks); + free = xfs_sum_freecounter_raw(sc->mp, XC_FREE_BLOCKS); sz = sc->mp->m_sb.sb_dblocks; } diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index b32fb233cf84..8703897c0a9c 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -935,10 +935,13 @@ xrep_reap_metadir_fsblocks( if (error) return error; - if (xreap_dirty(&rs)) - return xrep_defer_finish(sc); + if (xreap_dirty(&rs)) { + error = xrep_defer_finish(sc); + if (error) + return error; + } - return 0; + return xrep_reset_metafile_resv(sc); } /* diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 3b5288d3ef4e..f8f9ed30f56b 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -43,6 +43,7 @@ #include "xfs_rtalloc.h" #include "xfs_metafile.h" #include "xfs_rtrefcount_btree.h" +#include "xfs_zone_alloc.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -1050,7 +1051,13 @@ xrep_require_rtext_inuse( xfs_rtxnum_t startrtx; xfs_rtxnum_t endrtx; bool is_free = false; - int error; + int error = 0; + + if (xfs_has_zoned(mp)) { + if (!xfs_zone_rgbno_is_valid(sc->sr.rtg, rgbno + len - 1)) + return -EFSCORRUPTED; + return 0; + } startrtx = xfs_rgbno_to_rtx(mp, rgbno); endrtx = xfs_rgbno_to_rtx(mp, rgbno + len - 1); @@ -1386,11 +1393,12 @@ int xrep_reset_metafile_resv( struct xfs_scrub *sc) { - struct xfs_inode *ip = sc->ip; + struct xfs_mount *mp = sc->mp; int64_t delta; int error; - delta = ip->i_nblocks + ip->i_delayed_blks - ip->i_meta_resv_asked; + delta = mp->m_metafile_resv_used + mp->m_metafile_resv_avail - + mp->m_metafile_resv_target; if (delta == 0) return 0; @@ -1401,11 +1409,11 @@ xrep_reset_metafile_resv( if (delta > 0) { int64_t give_back; - give_back = min_t(uint64_t, delta, ip->i_delayed_blks); + give_back = min_t(uint64_t, delta, mp->m_metafile_resv_avail); if (give_back > 0) { - xfs_mod_delalloc(ip, 0, -give_back); - xfs_add_fdblocks(ip->i_mount, give_back); - ip->i_delayed_blks -= give_back; + xfs_mod_sb_delalloc(mp, -give_back); + xfs_add_fdblocks(mp, give_back); + mp->m_metafile_resv_avail -= give_back; } return 0; @@ -1413,24 +1421,23 @@ xrep_reset_metafile_resv( /* * Not enough reservation; try to take some blocks from the filesystem - * to the metadata inode. @delta is negative here, so invert the sign. + * to the metabtree reservation. */ - delta = -delta; - error = xfs_dec_fdblocks(sc->mp, delta, true); + delta = -delta; /* delta is negative here, so invert the sign. 
*/ + error = xfs_dec_fdblocks(mp, delta, true); while (error == -ENOSPC) { delta--; if (delta == 0) { xfs_warn(sc->mp, -"Insufficient free space to reset space reservation for inode 0x%llx after repair.", - ip->i_ino); +"Insufficient free space to reset metabtree reservation after repair."); return 0; } - error = xfs_dec_fdblocks(sc->mp, delta, true); + error = xfs_dec_fdblocks(mp, delta, true); } if (error) return error; - xfs_mod_delalloc(ip, 0, delta); - ip->i_delayed_blks += delta; + xfs_mod_sb_delalloc(mp, delta); + mp->m_metafile_resv_avail += delta; return 0; } diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index e8c776a34c1d..d5ff8609dbfb 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -21,6 +21,7 @@ #include "xfs_rmap.h" #include "xfs_rtrmap_btree.h" #include "xfs_exchmaps.h" +#include "xfs_zone_alloc.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/repair.h" @@ -272,7 +273,6 @@ xchk_xref_is_used_rt_space( xfs_extlen_t len) { struct xfs_rtgroup *rtg = sc->sr.rtg; - struct xfs_inode *rbmip = rtg_bitmap(rtg); xfs_rtxnum_t startext; xfs_rtxnum_t endext; bool is_free; @@ -281,6 +281,13 @@ xchk_xref_is_used_rt_space( if (xchk_skip_xref(sc->sm)) return; + if (xfs_has_zoned(sc->mp)) { + if (!xfs_zone_rgbno_is_valid(rtg, + xfs_rtb_to_rgbno(sc->mp, rtbno) + len - 1)) + xchk_ino_xref_set_corrupt(sc, rtg_rmap(rtg)->i_ino); + return; + } + startext = xfs_rtb_to_rtx(sc->mp, rtbno); endext = xfs_rtb_to_rtx(sc->mp, rtbno + len - 1); error = xfs_rtalloc_extent_is_free(rtg, sc->tp, startext, @@ -288,5 +295,5 @@ xchk_xref_is_used_rt_space( if (!xchk_should_check_xref(sc, &error, NULL)) return; if (is_free) - xchk_ino_xref_set_corrupt(sc, rbmip->i_ino); + xchk_ino_xref_set_corrupt(sc, rtg_bitmap(rtg)->i_ino); } diff --git a/fs/xfs/scrub/rtrefcount_repair.c b/fs/xfs/scrub/rtrefcount_repair.c index 257cfb24beb4..983362447826 100644 --- a/fs/xfs/scrub/rtrefcount_repair.c +++ b/fs/xfs/scrub/rtrefcount_repair.c @@ -697,32 +697,6 @@ err_cur: return error; } -/* - * Now that we've logged the roots of the new btrees, invalidate all of the - * old blocks and free them. - */ -STATIC int -xrep_rtrefc_remove_old_tree( - struct xrep_rtrefc *rr) -{ - int error; - - /* - * Free all the extents that were allocated to the former rtrefcountbt - * and aren't cross-linked with something else. - */ - error = xrep_reap_metadir_fsblocks(rr->sc, - &rr->old_rtrefcountbt_blocks); - if (error) - return error; - - /* - * Ensure the proper reservation for the rtrefcount inode so that we - * don't fail to expand the btree. - */ - return xrep_reset_metafile_resv(rr->sc); -} - /* Rebuild the rt refcount btree. */ int xrep_rtrefcountbt( @@ -769,8 +743,12 @@ xrep_rtrefcountbt( if (error) goto out_bitmap; - /* Kill the old tree. */ - error = xrep_rtrefc_remove_old_tree(rr); + /* + * Free all the extents that were allocated to the former rtrefcountbt + * and aren't cross-linked with something else. + */ + error = xrep_reap_metadir_fsblocks(rr->sc, + &rr->old_rtrefcountbt_blocks); if (error) goto out_bitmap; diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c index f2fdd7a9fc24..fc2592c53af5 100644 --- a/fs/xfs/scrub/rtrmap_repair.c +++ b/fs/xfs/scrub/rtrmap_repair.c @@ -810,28 +810,6 @@ err_cur: /* Reaping the old btree. */ -/* Reap the old rtrmapbt blocks. 
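
[Editor's aside; illustration only, not part of the patch.]  The reworked xrep_reset_metafile_resv() above reduces to simple counter arithmetic: compare used + available against the target, give any surplus back to the free block pool, or take the shortfall from it, settling for progressively less when the pool runs dry.  pool_put()/pool_take() stand in for xfs_add_fdblocks()/xfs_dec_fdblocks() and are invented names:

	static void reset_resv(uint64_t used, uint64_t *avail, uint64_t target)
	{
		int64_t delta = (int64_t)(used + *avail) - (int64_t)target;

		if (delta > 0) {
			/* over target: return unused blocks to the pool */
			uint64_t give = min_t(uint64_t, delta, *avail);

			*avail -= give;
			pool_put(give);
			return;
		}
		/* under target: take blocks, retrying with smaller amounts */
		for (delta = -delta; delta > 0; delta--) {
			if (pool_take(delta) == 0) {	/* 0 on success */
				*avail += delta;
				return;
			}
		}
		/* pool exhausted: leave the reservation short and warn */
	}
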
*/ -STATIC int -xrep_rtrmap_remove_old_tree( - struct xrep_rtrmap *rr) -{ - int error; - - /* - * Free all the extents that were allocated to the former rtrmapbt and - * aren't cross-linked with something else. - */ - error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks); - if (error) - return error; - - /* - * Ensure the proper reservation for the rtrmap inode so that we don't - * fail to expand the new btree. - */ - return xrep_reset_metafile_resv(rr->sc); -} - static inline bool xrep_rtrmapbt_want_live_update( struct xchk_iscan *iscan, @@ -995,8 +973,11 @@ xrep_rtrmapbt( if (error) goto out_records; - /* Kill the old tree. */ - error = xrep_rtrmap_remove_old_tree(rr); + /* + * Free all the extents that were allocated to the former rtrmapbt and + * aren't cross-linked with something else. + */ + error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks); if (error) goto out_records; diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 6fa9e3e5bab7..9908850bf76f 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -399,12 +399,14 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { }, [XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */ .type = ST_RTGROUP, + .has = xfs_has_nonzoned, .setup = xchk_setup_rtbitmap, .scrub = xchk_rtbitmap, .repair = xrep_rtbitmap, }, [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ .type = ST_RTGROUP, + .has = xfs_has_nonzoned, .setup = xchk_setup_rtsummary, .scrub = xchk_rtsummary, .repair = xrep_rtsummary, diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 5077d52a775d..26a04a783489 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * Copyright (c) 2016-2018 Christoph Hellwig. + * Copyright (c) 2016-2025 Christoph Hellwig. * All Rights Reserved. */ #include "xfs.h" @@ -20,6 +20,8 @@ #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_icache.h" +#include "xfs_zone_alloc.h" +#include "xfs_rtgroup.h" struct xfs_writepage_ctx { struct iomap_writepage_ctx ctx; @@ -77,6 +79,26 @@ xfs_setfilesize( return xfs_trans_commit(tp); } +static void +xfs_ioend_put_open_zones( + struct iomap_ioend *ioend) +{ + struct iomap_ioend *tmp; + + /* + * Put the open zone for all ioends merged into this one (if any). + */ + list_for_each_entry(tmp, &ioend->io_list, io_list) + xfs_open_zone_put(tmp->io_private); + + /* + * The main ioend might not have an open zone if the submission failed + * before xfs_zone_alloc_and_submit got called. + */ + if (ioend->io_private) + xfs_open_zone_put(ioend->io_private); +} + /* * IO write completion. */ @@ -86,6 +108,7 @@ xfs_end_ioend( { struct xfs_inode *ip = XFS_I(ioend->io_inode); struct xfs_mount *mp = ip->i_mount; + bool is_zoned = xfs_is_zoned_inode(ip); xfs_off_t offset = ioend->io_offset; size_t size = ioend->io_size; unsigned int nofs_flag; @@ -116,9 +139,10 @@ xfs_end_ioend( error = blk_status_to_errno(ioend->io_bio.bi_status); if (unlikely(error)) { if (ioend->io_flags & IOMAP_IOEND_SHARED) { + ASSERT(!is_zoned); xfs_reflink_cancel_cow_range(ip, offset, size, true); xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset, - offset + size); + offset + size, NULL); } goto done; } @@ -126,14 +150,21 @@ xfs_end_ioend( /* * Success: commit the COW or unwritten blocks if needed. 
 */
-	if (ioend->io_flags & IOMAP_IOEND_SHARED)
+	if (is_zoned)
+		error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector,
+				ioend->io_private, NULLFSBLOCK);
+	else if (ioend->io_flags & IOMAP_IOEND_SHARED)
 		error = xfs_reflink_end_cow(ip, offset, size);
 	else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
 		error = xfs_iomap_write_unwritten(ip, offset, size, false);
 
-	if (!error && xfs_ioend_is_append(ioend))
+	if (!error &&
+	    !(ioend->io_flags & IOMAP_IOEND_DIRECT) &&
+	    xfs_ioend_is_append(ioend))
 		error = xfs_setfilesize(ip, offset, size);
 done:
+	if (is_zoned)
+		xfs_ioend_put_open_zones(ioend);
 	iomap_finish_ioends(ioend, error);
 	memalloc_nofs_restore(nofs_flag);
 }
@@ -176,17 +207,27 @@ xfs_end_io(
 	}
 }
 
-STATIC void
+void
 xfs_end_bio(
 	struct bio		*bio)
 {
 	struct iomap_ioend	*ioend = iomap_ioend_from_bio(bio);
 	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
+	struct xfs_mount	*mp = ip->i_mount;
 	unsigned long		flags;
 
+	/*
+	 * For appends, record the block number actually written and set the
+	 * boundary flag if needed.
+	 */
+	if (IS_ENABLED(CONFIG_XFS_RT) && bio_is_zone_append(bio)) {
+		ioend->io_sector = bio->bi_iter.bi_sector;
+		xfs_mark_rtg_boundary(ioend);
+	}
+
 	spin_lock_irqsave(&ip->i_ioend_lock, flags);
 	if (list_empty(&ip->i_ioend_list))
-		WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
+		WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
 				&ip->i_ioend_work));
 	list_add_tail(&ioend->io_list, &ip->i_ioend_list);
 	spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
@@ -463,7 +504,7 @@ xfs_discard_folio(
 	 * folio itself and not the start offset that is passed in.
 	 */
 	xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
-			folio_pos(folio) + folio_size(folio));
+			folio_pos(folio) + folio_size(folio), NULL);
 }
 
 static const struct iomap_writeback_ops xfs_writeback_ops = {
@@ -472,15 +513,125 @@ static const struct iomap_writeback_ops xfs_writeback_ops = {
 	.discard_folio		= xfs_discard_folio,
 };
 
+struct xfs_zoned_writepage_ctx {
+	struct iomap_writepage_ctx ctx;
+	struct xfs_open_zone	*open_zone;
+};
+
+static inline struct xfs_zoned_writepage_ctx *
+XFS_ZWPC(struct iomap_writepage_ctx *ctx)
+{
+	return container_of(ctx, struct xfs_zoned_writepage_ctx, ctx);
+}
+
+static int
+xfs_zoned_map_blocks(
+	struct iomap_writepage_ctx *wpc,
+	struct inode		*inode,
+	loff_t			offset,
+	unsigned int		len)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + len);
+	xfs_filblks_t		count_fsb;
+	struct xfs_bmbt_irec	imap, del;
+	struct xfs_iext_cursor	icur;
+
+	if (xfs_is_shutdown(mp))
+		return -EIO;
+
+	XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);
+
+	/*
+	 * All dirty data must be covered by delalloc extents.  But truncate can
+	 * remove delalloc extents underneath us or reduce their size.
+	 * Returning a hole tells iomap to not write back any data from this
+	 * range, which is the right thing to do in that case.
+	 *
+	 * Otherwise just tell iomap to treat ranges previously covered by a
+	 * delalloc extent as mapped.  The actual block allocation will be done
+	 * just before submitting the bio.
+	 *
+	 * This implies we never map outside folios that are locked or marked
+	 * as under writeback, and thus there is no need to check the fork
+	 * sequence count here.
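
[Editor's aside; illustration only, not part of the patch.]  Why xfs_end_bio() above samples bi_sector: with REQ_OP_ZONE_APPEND the device, not the filesystem, picks the write location inside the zone and reports it back in bi_iter.bi_sector when the bio completes.  A minimal consumer looks like this (record_mapping() is an invented callback):

	static void zone_append_end_io(struct bio *bio)
	{
		/* bi_sector now holds where the data actually landed */
		if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
			record_mapping(bio->bi_private,
				       bio->bi_iter.bi_sector);
		bio_put(bio);
	}
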
+	 */
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
+		imap.br_startoff = end_fsb;	/* fake a hole past EOF */
+	if (imap.br_startoff > offset_fsb) {
+		imap.br_blockcount = imap.br_startoff - offset_fsb;
+		imap.br_startoff = offset_fsb;
+		imap.br_startblock = HOLESTARTBLOCK;
+		imap.br_state = XFS_EXT_NORM;
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0);
+		return 0;
+	}
+	end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
+	count_fsb = end_fsb - offset_fsb;
+
+	del = imap;
+	xfs_trim_extent(&del, offset_fsb, count_fsb);
+	xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del,
+			XFS_BMAPI_REMAP);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	wpc->iomap.type = IOMAP_MAPPED;
+	wpc->iomap.flags = IOMAP_F_DIRTY | IOMAP_F_ANON_WRITE;
+	wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev;
+	wpc->iomap.offset = offset;
+	wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb);
+
+	trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length);
+	return 0;
+}
+
+static int
+xfs_zoned_submit_ioend(
+	struct iomap_writepage_ctx *wpc,
+	int			status)
+{
+	wpc->ioend->io_bio.bi_end_io = xfs_end_bio;
+	if (status)
+		return status;
+	xfs_zone_alloc_and_submit(wpc->ioend, &XFS_ZWPC(wpc)->open_zone);
+	return 0;
+}
+
+static const struct iomap_writeback_ops xfs_zoned_writeback_ops = {
+	.map_blocks		= xfs_zoned_map_blocks,
+	.submit_ioend		= xfs_zoned_submit_ioend,
+	.discard_folio		= xfs_discard_folio,
+};
+
 STATIC int
 xfs_vm_writepages(
 	struct address_space	*mapping,
 	struct writeback_control *wbc)
 {
-	struct xfs_writepage_ctx wpc = { };
+	struct xfs_inode	*ip = XFS_I(mapping->host);
+
+	xfs_iflags_clear(ip, XFS_ITRUNCATED);
 
-	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
-	return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
+	if (xfs_is_zoned_inode(ip)) {
+		struct xfs_zoned_writepage_ctx xc = { };
+		int			error;
+
+		error = iomap_writepages(mapping, wbc, &xc.ctx,
+				&xfs_zoned_writeback_ops);
+		if (xc.open_zone)
+			xfs_open_zone_put(xc.open_zone);
+		return error;
+	} else {
+		struct xfs_writepage_ctx wpc = { };
+
+		return iomap_writepages(mapping, wbc, &wpc.ctx,
+				&xfs_writeback_ops);
+	}
 }
 
 STATIC int
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index e0bd68419764..5a7a0f1a0b49 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -9,6 +9,7 @@
 extern const struct address_space_operations xfs_address_space_operations;
 extern const struct address_space_operations xfs_dax_aops;
 
-int	xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
+int	xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
+void	xfs_end_bio(struct bio *bio);
 
 #endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 0836fea2d6d8..06ca11731e43 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -30,6 +30,7 @@
 #include "xfs_reflink.h"
 #include "xfs_rtbitmap.h"
 #include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
 
 /* Kernel only BMAP related definitions and functions */
 
@@ -436,7 +437,8 @@ xfs_bmap_punch_delalloc_range(
 	struct xfs_inode	*ip,
 	int			whichfork,
 	xfs_off_t		start_byte,
-	xfs_off_t		end_byte)
+	xfs_off_t		end_byte,
+	struct xfs_zone_alloc_ctx *ac)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
@@ -467,7 +469,21 @@ xfs_bmap_punch_delalloc_range(
 			continue;
 		}
 
-		xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del);
+		if (xfs_is_zoned_inode(ip) && ac) {
+			/*
+			 * In a
zoned buffered write context we need to return + * the punched delalloc allocations to the allocation + * context. This allows reusing them in the following + * iomap iterations. + */ + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, + &del, XFS_BMAPI_REMAP); + ac->reserved_blocks += del.br_blockcount; + } else { + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, + &del, 0); + } + if (!xfs_iext_get_extent(ifp, &icur, &got)) break; } @@ -582,7 +598,7 @@ xfs_free_eofblocks( if (ip->i_delayed_blks) { xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize), - LLONG_MAX); + LLONG_MAX, NULL); } xfs_inode_clear_eofblocks_tag(ip); return 0; @@ -825,7 +841,8 @@ int xfs_free_file_space( struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len) + xfs_off_t len, + struct xfs_zone_alloc_ctx *ac) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t startoffset_fsb; @@ -880,7 +897,7 @@ xfs_free_file_space( return 0; if (offset + len > XFS_ISIZE(ip)) len = XFS_ISIZE(ip) - offset; - error = xfs_zero_range(ip, offset, len, NULL); + error = xfs_zero_range(ip, offset, len, ac, NULL); if (error) return error; @@ -968,7 +985,8 @@ int xfs_collapse_file_space( struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len) + xfs_off_t len, + struct xfs_zone_alloc_ctx *ac) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; @@ -981,7 +999,7 @@ xfs_collapse_file_space( trace_xfs_collapse_file_space(ip); - error = xfs_free_file_space(ip, offset, len); + error = xfs_free_file_space(ip, offset, len, ac); if (error) return error; diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index b29760d36e1a..c477b3361630 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -15,6 +15,7 @@ struct xfs_inode; struct xfs_mount; struct xfs_trans; struct xfs_bmalloca; +struct xfs_zone_alloc_ctx; #ifdef CONFIG_XFS_RT int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); @@ -31,7 +32,8 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap) #endif /* CONFIG_XFS_RT */ void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, int whichfork, - xfs_off_t start_byte, xfs_off_t end_byte); + xfs_off_t start_byte, xfs_off_t end_byte, + struct xfs_zone_alloc_ctx *ac); struct kgetbmap { __s64 bmv_offset; /* file offset of segment in blocks */ @@ -54,13 +56,13 @@ int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip, /* preallocation and hole punch interface */ int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len); + xfs_off_t len); int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len); + xfs_off_t len, struct xfs_zone_alloc_ctx *ac); int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset, - xfs_off_t len); + xfs_off_t len, struct xfs_zone_alloc_ctx *ac); int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset, - xfs_off_t len); + xfs_off_t len); /* EOF block manipulation functions */ bool xfs_can_free_eofblocks(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 5d560e9073f4..8e7f1b324b3b 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -55,27 +55,6 @@ static inline bool xfs_buf_is_uncached(struct xfs_buf *bp) return bp->b_rhash_key == XFS_BUF_DADDR_NULL; } -static inline int -xfs_buf_is_vmapped( - struct xfs_buf *bp) -{ - /* - * Return true if the buffer is vmapped. - * - * b_addr is null if the buffer is not mapped, but the code is clever - * enough to know it doesn't have to map a single page, so the check has - * to be both for b_addr and bp->b_page_count > 1. 
- */
-	return bp->b_addr && bp->b_page_count > 1;
-}
-
-static inline int
-xfs_buf_vmap_len(
-	struct xfs_buf	*bp)
-{
-	return (bp->b_page_count * PAGE_SIZE);
-}
-
 /*
  * When we mark a buffer stale, we remove the buffer from the LRU and clear the
  * b_lru_ref count so that the buffer is freed immediately when the buffer
@@ -109,38 +88,168 @@ xfs_buf_stale(
 	spin_unlock(&bp->b_lock);
 }
 
+static void
+xfs_buf_free_callback(
+	struct callback_head	*cb)
+{
+	struct xfs_buf		*bp = container_of(cb, struct xfs_buf, b_rcu);
+
+	if (bp->b_maps != &bp->__b_map)
+		kfree(bp->b_maps);
+	kmem_cache_free(xfs_buf_cache, bp);
+}
+
+static void
+xfs_buf_free(
+	struct xfs_buf		*bp)
+{
+	unsigned int		size = BBTOB(bp->b_length);
+
+	trace_xfs_buf_free(bp, _RET_IP_);
+
+	ASSERT(list_empty(&bp->b_lru));
+
+	if (!xfs_buftarg_is_mem(bp->b_target) && size >= PAGE_SIZE)
+		mm_account_reclaimed_pages(howmany(size, PAGE_SHIFT));
+
+	if (is_vmalloc_addr(bp->b_addr))
+		vfree(bp->b_addr);
+	else if (bp->b_flags & _XBF_KMEM)
+		kfree(bp->b_addr);
+	else
+		folio_put(virt_to_folio(bp->b_addr));
+
+	call_rcu(&bp->b_rcu, xfs_buf_free_callback);
+}
+
 static int
-xfs_buf_get_maps(
+xfs_buf_alloc_kmem(
 	struct xfs_buf		*bp,
-	int			map_count)
+	size_t			size,
+	gfp_t			gfp_mask)
 {
-	ASSERT(bp->b_maps == NULL);
-	bp->b_map_count = map_count;
+	ASSERT(is_power_of_2(size));
+	ASSERT(size < PAGE_SIZE);
 
-	if (map_count == 1) {
-		bp->b_maps = &bp->__b_map;
-		return 0;
-	}
+	bp->b_addr = kmalloc(size, gfp_mask | __GFP_NOFAIL);
+	if (!bp->b_addr)
+		return -ENOMEM;
 
-	bp->b_maps = kzalloc(map_count * sizeof(struct xfs_buf_map),
-				GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
-	if (!bp->b_maps)
+	/*
+	 * Slab guarantees that we get back naturally aligned allocations for
+	 * power of two sizes.  Keep this check as the canary in the coal mine
+	 * if anything changes in slab.
+	 */
+	if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)bp->b_addr, size))) {
+		kfree(bp->b_addr);
+		bp->b_addr = NULL;
 		return -ENOMEM;
+	}
+	bp->b_flags |= _XBF_KMEM;
+	trace_xfs_buf_backing_kmem(bp, _RET_IP_);
 	return 0;
 }
 
-static void
-xfs_buf_free_maps(
-	struct xfs_buf	*bp)
+/*
+ * Allocate backing memory for a buffer.
+ *
+ * For tmpfs-backed buffers used by in-memory btrees this directly maps the
+ * tmpfs page cache folios.
+ *
+ * For real file system buffers there are three different kinds of backing
+ * memory:
+ *
+ * The first type backs the buffer by a kmalloc allocation.  This is done for
+ * less than PAGE_SIZE allocations to avoid wasting memory.
+ *
+ * The second type is a single folio buffer - this may be a high order folio or
+ * just a single page sized folio, but either way they get treated the same way
+ * by the rest of the code - the buffer memory spans a single contiguous memory
+ * region that we don't have to map and unmap to access the data directly.
+ *
+ * The third type of buffer is the vmalloc()d buffer.  This provides the buffer
+ * with the required contiguous memory region but backed by discontiguous
+ * physical pages.
+ */
+static int
+xfs_buf_alloc_backing_mem(
+	struct xfs_buf		*bp,
+	xfs_buf_flags_t		flags)
 {
-	if (bp->b_maps != &bp->__b_map) {
-		kfree(bp->b_maps);
-		bp->b_maps = NULL;
+	size_t			size = BBTOB(bp->b_length);
+	gfp_t			gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN;
+	struct folio		*folio;
+
+	if (xfs_buftarg_is_mem(bp->b_target))
+		return xmbuf_map_backing_mem(bp);
+
+	/* Assure zeroed buffer for non-read cases.
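
[Editor's aside; illustration only, not part of the patch.]  The allocation strategy laid out in the comment above, condensed into a decision table.  choose_backing() and the enum are invented names, and the real code additionally falls back from a failed high order folio allocation to vmalloc at run time:

	enum backing { BACKING_KMALLOC, BACKING_FOLIO, BACKING_VMALLOC };

	static enum backing choose_backing(size_t size)
	{
		if (size < PAGE_SIZE && is_power_of_2(size))
			return BACKING_KMALLOC;	/* naturally aligned slab */
		if (size <= PAGE_SIZE || is_power_of_2(size))
			return BACKING_FOLIO;	/* one contiguous folio */
		return BACKING_VMALLOC;		/* odd sizes, e.g. xattr data */
	}
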
*/ + if (!(flags & XBF_READ)) + gfp_mask |= __GFP_ZERO; + + if (flags & XBF_READ_AHEAD) + gfp_mask |= __GFP_NORETRY; + + /* + * For buffers smaller than PAGE_SIZE use a kmalloc allocation if that + * is properly aligned. The slab allocator now guarantees an aligned + * allocation for all power of two sizes, which matches most of the + * smaller than PAGE_SIZE buffers used by XFS. + */ + if (size < PAGE_SIZE && is_power_of_2(size)) + return xfs_buf_alloc_kmem(bp, size, gfp_mask); + + /* + * Don't bother with the retry loop for single PAGE allocations: vmalloc + * won't do any better. + */ + if (size <= PAGE_SIZE) + gfp_mask |= __GFP_NOFAIL; + + /* + * Optimistically attempt a single high order folio allocation for + * larger than PAGE_SIZE buffers. + * + * Allocating a high order folio makes the assumption that buffers are a + * power-of-2 size, matching the power-of-2 folios sizes available. + * + * The exception here are user xattr data buffers, which can be arbitrarily + * sized up to 64kB plus structure metadata, skip straight to the vmalloc + * path for them instead of wasting memory here. + */ + if (size > PAGE_SIZE) { + if (!is_power_of_2(size)) + goto fallback; + gfp_mask &= ~__GFP_DIRECT_RECLAIM; + gfp_mask |= __GFP_NORETRY; } + folio = folio_alloc(gfp_mask, get_order(size)); + if (!folio) { + if (size <= PAGE_SIZE) + return -ENOMEM; + trace_xfs_buf_backing_fallback(bp, _RET_IP_); + goto fallback; + } + bp->b_addr = folio_address(folio); + trace_xfs_buf_backing_folio(bp, _RET_IP_); + return 0; + +fallback: + for (;;) { + bp->b_addr = __vmalloc(size, gfp_mask); + if (bp->b_addr) + break; + if (flags & XBF_READ_AHEAD) + return -ENOMEM; + XFS_STATS_INC(bp->b_mount, xb_page_retries); + memalloc_retry_wait(gfp_mask); + } + + trace_xfs_buf_backing_vmalloc(bp, _RET_IP_); + return 0; } static int -_xfs_buf_alloc( +xfs_buf_alloc( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, @@ -159,7 +268,7 @@ _xfs_buf_alloc( * We don't want certain flags to appear in b_flags unless they are * specifically set by later operations on the buffer. */ - flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); + flags &= ~(XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); /* * A new buffer is held and locked by the owner. 
This ensures that the @@ -179,15 +288,14 @@ _xfs_buf_alloc( bp->b_target = target; bp->b_mount = target->bt_mount; bp->b_flags = flags; - - error = xfs_buf_get_maps(bp, nmaps); - if (error) { - kmem_cache_free(xfs_buf_cache, bp); - return error; - } - bp->b_rhash_key = map[0].bm_bn; bp->b_length = 0; + bp->b_map_count = nmaps; + if (nmaps == 1) + bp->b_maps = &bp->__b_map; + else + bp->b_maps = kcalloc(nmaps, sizeof(struct xfs_buf_map), + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); for (i = 0; i < nmaps; i++) { bp->b_maps[i].bm_bn = map[i].bm_bn; bp->b_maps[i].bm_len = map[i].bm_len; @@ -200,195 +308,13 @@ _xfs_buf_alloc( XFS_STATS_INC(bp->b_mount, xb_create); trace_xfs_buf_init(bp, _RET_IP_); - *bpp = bp; - return 0; -} - -static void -xfs_buf_free_pages( - struct xfs_buf *bp) -{ - uint i; - - ASSERT(bp->b_flags & _XBF_PAGES); - - if (xfs_buf_is_vmapped(bp)) - vm_unmap_ram(bp->b_addr, bp->b_page_count); - - for (i = 0; i < bp->b_page_count; i++) { - if (bp->b_pages[i]) - __free_page(bp->b_pages[i]); - } - mm_account_reclaimed_pages(bp->b_page_count); - - if (bp->b_pages != bp->b_page_array) - kfree(bp->b_pages); - bp->b_pages = NULL; - bp->b_flags &= ~_XBF_PAGES; -} - -static void -xfs_buf_free_callback( - struct callback_head *cb) -{ - struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu); - - xfs_buf_free_maps(bp); - kmem_cache_free(xfs_buf_cache, bp); -} - -static void -xfs_buf_free( - struct xfs_buf *bp) -{ - trace_xfs_buf_free(bp, _RET_IP_); - - ASSERT(list_empty(&bp->b_lru)); - - if (xfs_buftarg_is_mem(bp->b_target)) - xmbuf_unmap_page(bp); - else if (bp->b_flags & _XBF_PAGES) - xfs_buf_free_pages(bp); - else if (bp->b_flags & _XBF_KMEM) - kfree(bp->b_addr); - - call_rcu(&bp->b_rcu, xfs_buf_free_callback); -} - -static int -xfs_buf_alloc_kmem( - struct xfs_buf *bp, - xfs_buf_flags_t flags) -{ - gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL; - size_t size = BBTOB(bp->b_length); - - /* Assure zeroed buffer for non-read cases. */ - if (!(flags & XBF_READ)) - gfp_mask |= __GFP_ZERO; - - bp->b_addr = kmalloc(size, gfp_mask); - if (!bp->b_addr) - return -ENOMEM; - - if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) != - ((unsigned long)bp->b_addr & PAGE_MASK)) { - /* b_addr spans two pages - use alloc_page instead */ - kfree(bp->b_addr); - bp->b_addr = NULL; - return -ENOMEM; - } - bp->b_offset = offset_in_page(bp->b_addr); - bp->b_pages = bp->b_page_array; - bp->b_pages[0] = kmem_to_page(bp->b_addr); - bp->b_page_count = 1; - bp->b_flags |= _XBF_KMEM; - return 0; -} - -static int -xfs_buf_alloc_pages( - struct xfs_buf *bp, - xfs_buf_flags_t flags) -{ - gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN; - long filled = 0; - - if (flags & XBF_READ_AHEAD) - gfp_mask |= __GFP_NORETRY; - - /* Make sure that we have a page list */ - bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE); - if (bp->b_page_count <= XB_PAGES) { - bp->b_pages = bp->b_page_array; - } else { - bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count, - gfp_mask); - if (!bp->b_pages) - return -ENOMEM; - } - bp->b_flags |= _XBF_PAGES; - - /* Assure zeroed buffer for non-read cases. */ - if (!(flags & XBF_READ)) - gfp_mask |= __GFP_ZERO; - - /* - * Bulk filling of pages can take multiple calls. Not filling the entire - * array is not an allocation failure, so don't back off if we get at - * least one extra page. 
- */ - for (;;) { - long last = filled; - - filled = alloc_pages_bulk(gfp_mask, bp->b_page_count, - bp->b_pages); - if (filled == bp->b_page_count) { - XFS_STATS_INC(bp->b_mount, xb_page_found); - break; - } - - if (filled != last) - continue; - - if (flags & XBF_READ_AHEAD) { - xfs_buf_free_pages(bp); - return -ENOMEM; - } - - XFS_STATS_INC(bp->b_mount, xb_page_retries); - memalloc_retry_wait(gfp_mask); - } - return 0; -} - -/* - * Map buffer into kernel address-space if necessary. - */ -STATIC int -_xfs_buf_map_pages( - struct xfs_buf *bp, - xfs_buf_flags_t flags) -{ - ASSERT(bp->b_flags & _XBF_PAGES); - if (bp->b_page_count == 1) { - /* A single page buffer is always mappable */ - bp->b_addr = page_address(bp->b_pages[0]); - } else if (flags & XBF_UNMAPPED) { - bp->b_addr = NULL; - } else { - int retried = 0; - unsigned nofs_flag; - - /* - * vm_map_ram() will allocate auxiliary structures (e.g. - * pagetables) with GFP_KERNEL, yet we often under a scoped nofs - * context here. Mixing GFP_KERNEL with GFP_NOFS allocations - * from the same call site that can be run from both above and - * below memory reclaim causes lockdep false positives. Hence we - * always need to force this allocation to nofs context because - * we can't pass __GFP_NOLOCKDEP down to auxillary structures to - * prevent false positive lockdep reports. - * - * XXX(dgc): I think dquot reclaim is the only place we can get - * to this function from memory reclaim context now. If we fix - * that like we've fixed inode reclaim to avoid writeback from - * reclaim, this nofs wrapping can go away. - */ - nofs_flag = memalloc_nofs_save(); - do { - bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, - -1); - if (bp->b_addr) - break; - vm_unmap_aliases(); - } while (retried++ <= 1); - memalloc_nofs_restore(nofs_flag); - - if (!bp->b_addr) - return -ENOMEM; + error = xfs_buf_alloc_backing_mem(bp, flags); + if (error) { + xfs_buf_free(bp); + return error; } + *bpp = bp; return 0; } @@ -507,7 +433,7 @@ xfs_buf_find_lock( return -ENOENT; } ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); - bp->b_flags &= _XBF_KMEM | _XBF_PAGES; + bp->b_flags &= _XBF_KMEM; bp->b_ops = NULL; } return 0; @@ -575,25 +501,10 @@ xfs_buf_find_insert( struct xfs_buf *bp; int error; - error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp); + error = xfs_buf_alloc(btp, map, nmaps, flags, &new_bp); if (error) goto out_drop_pag; - if (xfs_buftarg_is_mem(new_bp->b_target)) { - error = xmbuf_map_page(new_bp); - } else if (BBTOB(new_bp->b_length) >= PAGE_SIZE || - xfs_buf_alloc_kmem(new_bp, flags) < 0) { - /* - * For buffers that fit entirely within a single page, first - * attempt to allocate the memory from the heap to minimise - * memory usage. If we can't get heap memory for these small - * buffers, we fall back to using the page allocator. - */ - error = xfs_buf_alloc_pages(new_bp, flags); - } - if (error) - goto out_free_buf; - /* The new buffer keeps the perag reference until it is freed. */ new_bp->b_pag = pag; @@ -704,18 +615,6 @@ xfs_buf_get_map( xfs_perag_put(pag); } - /* We do not hold a perag reference anymore. */ - if (!bp->b_addr) { - error = _xfs_buf_map_pages(bp, flags); - if (unlikely(error)) { - xfs_warn_ratelimited(btp->bt_mount, - "%s: failed to map %u pages", __func__, - bp->b_page_count); - xfs_buf_relse(bp); - return error; - } - } - /* * Clear b_error if this is a lookup from a caller that doesn't expect * valid data to be found in the buffer. 
@@ -903,7 +802,6 @@ xfs_buf_read_uncached( struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, - xfs_buf_flags_t flags, struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { @@ -912,7 +810,7 @@ xfs_buf_read_uncached( *bpp = NULL; - error = xfs_buf_get_uncached(target, numblks, flags, &bp); + error = xfs_buf_get_uncached(target, numblks, &bp); if (error) return error; @@ -938,42 +836,14 @@ int xfs_buf_get_uncached( struct xfs_buftarg *target, size_t numblks, - xfs_buf_flags_t flags, struct xfs_buf **bpp) { int error; - struct xfs_buf *bp; DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); - /* there are currently no valid flags for xfs_buf_get_uncached */ - ASSERT(flags == 0); - - *bpp = NULL; - - error = _xfs_buf_alloc(target, &map, 1, flags, &bp); - if (error) - return error; - - if (xfs_buftarg_is_mem(bp->b_target)) - error = xmbuf_map_page(bp); - else - error = xfs_buf_alloc_pages(bp, flags); - if (error) - goto fail_free_buf; - - error = _xfs_buf_map_pages(bp, 0); - if (unlikely(error)) { - xfs_warn(target->bt_mount, - "%s: failed to map pages", __func__); - goto fail_free_buf; - } - - trace_xfs_buf_get_uncached(bp, _RET_IP_); - *bpp = bp; - return 0; - -fail_free_buf: - xfs_buf_free(bp); + error = xfs_buf_alloc(target, &map, 1, 0, bpp); + if (!error) + trace_xfs_buf_get_uncached(*bpp, _RET_IP_); return error; } @@ -1299,9 +1169,9 @@ __xfs_buf_ioend( trace_xfs_buf_iodone(bp, _RET_IP_); if (bp->b_flags & XBF_READ) { - if (!bp->b_error && xfs_buf_is_vmapped(bp)) + if (!bp->b_error && is_vmalloc_addr(bp->b_addr)) invalidate_kernel_vmap_range(bp->b_addr, - xfs_buf_vmap_len(bp)); + roundup(BBTOB(bp->b_length), PAGE_SIZE)); if (!bp->b_error && bp->b_ops) bp->b_ops->verify_read(bp); if (!bp->b_error) @@ -1462,29 +1332,48 @@ static void xfs_buf_submit_bio( struct xfs_buf *bp) { - unsigned int size = BBTOB(bp->b_length); - unsigned int map = 0, p; + unsigned int map = 0; struct blk_plug plug; struct bio *bio; - bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count, - xfs_buf_bio_op(bp), GFP_NOIO); - bio->bi_private = bp; - bio->bi_end_io = xfs_buf_bio_end_io; + if (is_vmalloc_addr(bp->b_addr)) { + unsigned int size = BBTOB(bp->b_length); + unsigned int alloc_size = roundup(size, PAGE_SIZE); + void *data = bp->b_addr; - if (bp->b_flags & _XBF_KMEM) { - __bio_add_page(bio, virt_to_page(bp->b_addr), size, - bp->b_offset); - } else { - for (p = 0; p < bp->b_page_count; p++) - __bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0); - bio->bi_iter.bi_size = size; /* limit to the actual size used */ + bio = bio_alloc(bp->b_target->bt_bdev, alloc_size >> PAGE_SHIFT, + xfs_buf_bio_op(bp), GFP_NOIO); + + do { + unsigned int len = min(size, PAGE_SIZE); + + ASSERT(offset_in_page(data) == 0); + __bio_add_page(bio, vmalloc_to_page(data), len, 0); + data += len; + size -= len; + } while (size); - if (xfs_buf_is_vmapped(bp)) - flush_kernel_vmap_range(bp->b_addr, - xfs_buf_vmap_len(bp)); + flush_kernel_vmap_range(bp->b_addr, alloc_size); + } else { + /* + * Single folio or slab allocation. Must be contiguous and thus + * only a single bvec is needed. + * + * This uses the page based bio add helper for now as that is + * the lowest common denominator between folios and slab + * allocations. To be replaced with a better block layer + * helper soon (hopefully). 
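
[Editor's aside; illustration only, not part of the patch.]  The reason the I/O paths above key off is_vmalloc_addr(): vmalloc creates a second kernel mapping of the underlying pages, and on virtually indexed caches that alias must be flushed before the device reads the data and invalidated before the CPU reads data the device wrote.  Slab and folio backed buffers use the linear mapping and need neither.  As a sketch:

	static void buf_sync_vmap(void *addr, size_t len, bool to_device)
	{
		if (!is_vmalloc_addr(addr))
			return;
		if (to_device)
			flush_kernel_vmap_range(addr, len);
		else
			invalidate_kernel_vmap_range(addr, len);
	}
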
+ */ + bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp), + GFP_NOIO); + __bio_add_page(bio, virt_to_page(bp->b_addr), + BBTOB(bp->b_length), + offset_in_page(bp->b_addr)); } + bio->bi_private = bp; + bio->bi_end_io = xfs_buf_bio_end_io; + /* * If there is more than one map segment, split out a new bio for each * map except of the last one. The last map is handled by the @@ -1611,47 +1500,6 @@ xfs_buf_submit( xfs_buf_submit_bio(bp); } -void * -xfs_buf_offset( - struct xfs_buf *bp, - size_t offset) -{ - struct page *page; - - if (bp->b_addr) - return bp->b_addr + offset; - - page = bp->b_pages[offset >> PAGE_SHIFT]; - return page_address(page) + (offset & (PAGE_SIZE-1)); -} - -void -xfs_buf_zero( - struct xfs_buf *bp, - size_t boff, - size_t bsize) -{ - size_t bend; - - bend = boff + bsize; - while (boff < bend) { - struct page *page; - int page_index, page_offset, csize; - - page_index = (boff + bp->b_offset) >> PAGE_SHIFT; - page_offset = (boff + bp->b_offset) & ~PAGE_MASK; - page = bp->b_pages[page_index]; - csize = min_t(size_t, PAGE_SIZE - page_offset, - BBTOB(bp->b_length) - boff); - - ASSERT((csize + page_offset) <= PAGE_SIZE); - - memset(page_address(page) + page_offset, 0, csize); - - boff += csize; - } -} - /* * Log a message about and stale a buffer that a caller has decided is corrupt. * diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 80e06eecaf56..d0b065a9a9f0 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -36,7 +36,6 @@ struct xfs_buf; #define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */ /* flags used only internally */ -#define _XBF_PAGES (1u << 20)/* backed by refcounted pages */ #define _XBF_KMEM (1u << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */ @@ -48,7 +47,6 @@ struct xfs_buf; #define XBF_LIVESCAN (1u << 28) #define XBF_INCORE (1u << 29)/* lookup only, return if found in cache */ #define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */ -#define XBF_UNMAPPED (1u << 31)/* do not map the buffer */ typedef unsigned int xfs_buf_flags_t; @@ -62,14 +60,12 @@ typedef unsigned int xfs_buf_flags_t; { XBF_STALE, "STALE" }, \ { XBF_WRITE_FAIL, "WRITE_FAIL" }, \ { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \ - { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ /* The following interface flags should never be set */ \ { XBF_LIVESCAN, "LIVESCAN" }, \ { XBF_INCORE, "INCORE" }, \ - { XBF_TRYLOCK, "TRYLOCK" }, \ - { XBF_UNMAPPED, "UNMAPPED" } + { XBF_TRYLOCK, "TRYLOCK" } /* * Internal state flags. 
@@ -124,8 +120,6 @@ struct xfs_buftarg { struct xfs_buf_cache bt_cache[]; }; -#define XB_PAGES 2 - struct xfs_buf_map { xfs_daddr_t bm_bn; /* block number for I/O */ int bm_len; /* size of I/O */ @@ -187,15 +181,10 @@ struct xfs_buf { struct xfs_buf_log_item *b_log_item; struct list_head b_li_list; /* Log items list head */ struct xfs_trans *b_transp; - struct page **b_pages; /* array of page pointers */ - struct page *b_page_array[XB_PAGES]; /* inline pages */ struct xfs_buf_map *b_maps; /* compound buffer map */ struct xfs_buf_map __b_map; /* inline compound buffer map */ int b_map_count; atomic_t b_pin_count; /* pin count */ - unsigned int b_page_count; /* size of page array */ - unsigned int b_offset; /* page offset of b_addr, - only for _XBF_KMEM buffers */ int b_error; /* error code on I/O */ void (*b_iodone)(struct xfs_buf *bp); @@ -284,9 +273,9 @@ xfs_buf_readahead( } int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, - xfs_buf_flags_t flags, struct xfs_buf **bpp); + struct xfs_buf **bpp); int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, - size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp, + size_t numblks, struct xfs_buf **bpp, const struct xfs_buf_ops *ops); int _xfs_buf_read(struct xfs_buf *bp); void xfs_buf_hold(struct xfs_buf *bp); @@ -315,12 +304,20 @@ extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error, #define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address) extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa); void xfs_buf_ioend_fail(struct xfs_buf *); -void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize); void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa); #define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address) /* Buffer Utility Routines */ -extern void *xfs_buf_offset(struct xfs_buf *, size_t); +static inline void *xfs_buf_offset(struct xfs_buf *bp, size_t offset) +{ + return bp->b_addr + offset; +} + +static inline void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize) +{ + memset(bp->b_addr + boff, 0, bsize); +} + extern void xfs_buf_stale(struct xfs_buf *bp); /* Delayed Write Buffer Routines */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 47549cfa61cd..19eb0b7a3e58 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -57,24 +57,6 @@ xfs_buf_log_format_size( (blfp->blf_map_size * sizeof(blfp->blf_data_map[0])); } -static inline bool -xfs_buf_item_straddle( - struct xfs_buf *bp, - uint offset, - int first_bit, - int nbits) -{ - void *first, *last; - - first = xfs_buf_offset(bp, offset + (first_bit << XFS_BLF_SHIFT)); - last = xfs_buf_offset(bp, - offset + ((first_bit + nbits) << XFS_BLF_SHIFT)); - - if (last - first != nbits * XFS_BLF_CHUNK) - return true; - return false; -} - /* * Return the number of log iovecs and space needed to log the given buf log * item segment. @@ -91,11 +73,8 @@ xfs_buf_item_size_segment( int *nvecs, int *nbytes) { - struct xfs_buf *bp = bip->bli_buf; int first_bit; int nbits; - int next_bit; - int last_bit; first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); if (first_bit == -1) @@ -108,15 +87,6 @@ xfs_buf_item_size_segment( nbits = xfs_contig_bits(blfp->blf_data_map, blfp->blf_map_size, first_bit); ASSERT(nbits > 0); - - /* - * Straddling a page is rare because we don't log contiguous - * chunks of unmapped buffers anywhere. 
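
[Editor's aside; illustration only, not part of the patch.]  With every buffer now contiguously mapped, the "straddle"/slow-scan paths removed below are unnecessary, and sizing the log iovecs reduces to counting runs of set bits in the dirty-chunk bitmap, one iovec per run of XFS_BLF_CHUNK (128 byte) chunks.  A sketch using the standard kernel bitmap helpers:

	static void count_dirty_runs(const unsigned long *map,
			unsigned int nbits, int *nvecs, int *nbytes)
	{
		unsigned int bit = 0, run;

		while ((bit = find_next_bit(map, nbits, bit)) < nbits) {
			run = find_next_zero_bit(map, nbits, bit) - bit;
			(*nvecs)++;			/* one iovec per run */
			*nbytes += run * 128;		/* XFS_BLF_CHUNK */
			bit += run;
		}
	}
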
- */ - if (nbits > 1 && - xfs_buf_item_straddle(bp, offset, first_bit, nbits)) - goto slow_scan; - (*nvecs)++; *nbytes += nbits * XFS_BLF_CHUNK; @@ -131,40 +101,6 @@ xfs_buf_item_size_segment( } while (first_bit != -1); return; - -slow_scan: - /* Count the first bit we jumped out of the above loop from */ - (*nvecs)++; - *nbytes += XFS_BLF_CHUNK; - last_bit = first_bit; - while (last_bit != -1) { - /* - * This takes the bit number to start looking from and - * returns the next set bit from there. It returns -1 - * if there are no more bits set or the start bit is - * beyond the end of the bitmap. - */ - next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, - last_bit + 1); - /* - * If we run out of bits, leave the loop, - * else if we find a new set of bits bump the number of vecs, - * else keep scanning the current set of bits. - */ - if (next_bit == -1) { - break; - } else if (next_bit != last_bit + 1 || - xfs_buf_item_straddle(bp, offset, first_bit, nbits)) { - last_bit = next_bit; - first_bit = next_bit; - (*nvecs)++; - nbits = 1; - } else { - last_bit++; - nbits++; - } - *nbytes += XFS_BLF_CHUNK; - } } /* @@ -277,8 +213,6 @@ xfs_buf_item_format_segment( struct xfs_buf *bp = bip->bli_buf; uint base_size; int first_bit; - int last_bit; - int next_bit; uint nbits; /* copy the flags across from the base format item */ @@ -323,15 +257,6 @@ xfs_buf_item_format_segment( nbits = xfs_contig_bits(blfp->blf_data_map, blfp->blf_map_size, first_bit); ASSERT(nbits > 0); - - /* - * Straddling a page is rare because we don't log contiguous - * chunks of unmapped buffers anywhere. - */ - if (nbits > 1 && - xfs_buf_item_straddle(bp, offset, first_bit, nbits)) - goto slow_scan; - xfs_buf_item_copy_iovec(lv, vecp, bp, offset, first_bit, nbits); blfp->blf_size++; @@ -347,45 +272,6 @@ xfs_buf_item_format_segment( } while (first_bit != -1); return; - -slow_scan: - ASSERT(bp->b_addr == NULL); - last_bit = first_bit; - nbits = 1; - for (;;) { - /* - * This takes the bit number to start looking from and - * returns the next set bit from there. It returns -1 - * if there are no more bits set or the start bit is - * beyond the end of the bitmap. - */ - next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, - (uint)last_bit + 1); - /* - * If we run out of bits fill in the last iovec and get out of - * the loop. Else if we start a new set of bits then fill in - * the iovec for the series we were looking at and start - * counting the bits in the new one. Else we're still in the - * same set of bits so just keep counting and scanning. 
- */ - if (next_bit == -1) { - xfs_buf_item_copy_iovec(lv, vecp, bp, offset, - first_bit, nbits); - blfp->blf_size++; - break; - } else if (next_bit != last_bit + 1 || - xfs_buf_item_straddle(bp, offset, first_bit, nbits)) { - xfs_buf_item_copy_iovec(lv, vecp, bp, offset, - first_bit, nbits); - blfp->blf_size++; - first_bit = next_bit; - last_bit = next_bit; - nbits = 1; - } else { - last_bit++; - nbits++; - } - } } /* diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 05a2f6927c12..d4c5cef5bc43 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -1006,7 +1006,6 @@ xlog_recover_buf_commit_pass2( struct xfs_mount *mp = log->l_mp; struct xfs_buf *bp; int error; - uint buf_flags; xfs_lsn_t lsn; /* @@ -1025,13 +1024,8 @@ xlog_recover_buf_commit_pass2( } trace_xfs_log_recover_buf_recover(log, buf_f); - - buf_flags = 0; - if (buf_f->blf_flags & XFS_BLF_INODE_BUF) - buf_flags |= XBF_UNMAPPED; - error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, - buf_flags, &bp, NULL); + 0, &bp, NULL); if (error) return error; diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c index 5b64a2b3b113..b4ffd80b7cb6 100644 --- a/fs/xfs/xfs_buf_mem.c +++ b/fs/xfs/xfs_buf_mem.c @@ -74,7 +74,7 @@ xmbuf_alloc( /* * We don't want to bother with kmapping data during repair, so don't - * allow highmem pages to back this mapping. + * allow highmem folios to back this mapping. */ mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); @@ -127,14 +127,13 @@ xmbuf_free( kfree(btp); } -/* Directly map a shmem page into the buffer cache. */ +/* Directly map a shmem folio into the buffer cache. */ int -xmbuf_map_page( +xmbuf_map_backing_mem( struct xfs_buf *bp) { struct inode *inode = file_inode(bp->b_target->bt_file); struct folio *folio = NULL; - struct page *page; loff_t pos = BBTOB(xfs_buf_daddr(bp)); int error; @@ -159,39 +158,17 @@ xmbuf_map_page( return -EIO; } - page = folio_file_page(folio, pos >> PAGE_SHIFT); - /* - * Mark the page dirty so that it won't be reclaimed once we drop the - * (potentially last) reference in xmbuf_unmap_page. + * Mark the folio dirty so that it won't be reclaimed once we drop the + * (potentially last) reference in xfs_buf_free. */ - set_page_dirty(page); - unlock_page(page); + folio_set_dirty(folio); + folio_unlock(folio); - bp->b_addr = page_address(page); - bp->b_pages = bp->b_page_array; - bp->b_pages[0] = page; - bp->b_page_count = 1; + bp->b_addr = folio_address(folio); return 0; } -/* Unmap a shmem page that was mapped into the buffer cache. */ -void -xmbuf_unmap_page( - struct xfs_buf *bp) -{ - struct page *page = bp->b_pages[0]; - - ASSERT(xfs_buftarg_is_mem(bp->b_target)); - - put_page(page); - - bp->b_addr = NULL; - bp->b_pages[0] = NULL; - bp->b_pages = NULL; - bp->b_page_count = 0; -} - /* Is this a valid daddr within the buftarg? */ bool xmbuf_verify_daddr( @@ -205,7 +182,7 @@ xmbuf_verify_daddr( return daddr < (inode->i_sb->s_maxbytes >> BBSHIFT); } -/* Discard the page backing this buffer. */ +/* Discard the folio backing this buffer. */ static void xmbuf_stale( struct xfs_buf *bp) @@ -220,7 +197,7 @@ xmbuf_stale( } /* - * Finalize a buffer -- discard the backing page if it's stale, or run the + * Finalize a buffer -- discard the backing folio if it's stale, or run the * write verifier to detect problems. 
*/ int diff --git a/fs/xfs/xfs_buf_mem.h b/fs/xfs/xfs_buf_mem.h index eed4a7b63232..67d525cc1513 100644 --- a/fs/xfs/xfs_buf_mem.h +++ b/fs/xfs/xfs_buf_mem.h @@ -19,16 +19,14 @@ int xmbuf_alloc(struct xfs_mount *mp, const char *descr, struct xfs_buftarg **btpp); void xmbuf_free(struct xfs_buftarg *btp); -int xmbuf_map_page(struct xfs_buf *bp); -void xmbuf_unmap_page(struct xfs_buf *bp); bool xmbuf_verify_daddr(struct xfs_buftarg *btp, xfs_daddr_t daddr); void xmbuf_trans_bdetach(struct xfs_trans *tp, struct xfs_buf *bp); int xmbuf_finalize(struct xfs_buf *bp); #else # define xfs_buftarg_is_mem(...) (false) -# define xmbuf_map_page(...) (-ENOMEM) -# define xmbuf_unmap_page(...) ((void)0) # define xmbuf_verify_daddr(...) (false) #endif /* CONFIG_XFS_MEMORY_BUFS */ +int xmbuf_map_backing_mem(struct xfs_buf *bp); + #endif /* __XFS_BUF_MEM_H__ */ diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 3f2403a7b49c..c1a306268ae4 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -844,7 +844,8 @@ xfs_ioc_trim( if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (mp->m_rtdev_targp && + + if (mp->m_rtdev_targp && !xfs_has_zoned(mp) && bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev)) rt_bdev = mp->m_rtdev_targp->bt_bdev; if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev) diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index ea43c9a6e54c..da3161572735 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -671,7 +671,7 @@ xfs_extent_busy_wait_all( while ((pag = xfs_perag_next(mp, pag))) xfs_extent_busy_wait_group(pag_group(pag)); - if (xfs_has_rtgroups(mp)) + if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp)) while ((rtg = xfs_rtgroup_next(mp, rtg))) xfs_extent_busy_wait_group(rtg_group(rtg)); } diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index a25c713ff888..777438b853da 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -29,6 +29,7 @@ #include "xfs_inode.h" #include "xfs_rtbitmap.h" #include "xfs_rtgroup.h" +#include "xfs_zone_alloc.h" struct kmem_cache *xfs_efi_cache; struct kmem_cache *xfs_efd_cache; @@ -767,21 +768,35 @@ xfs_rtextent_free_finish_item( trace_xfs_extent_free_deferred(mp, xefi); - if (!(xefi->xefi_flags & XFS_EFI_CANCELLED)) { - if (*rtgp != to_rtg(xefi->xefi_group)) { - *rtgp = to_rtg(xefi->xefi_group); - xfs_rtgroup_lock(*rtgp, XFS_RTGLOCK_BITMAP); - xfs_rtgroup_trans_join(tp, *rtgp, - XFS_RTGLOCK_BITMAP); - } - error = xfs_rtfree_blocks(tp, *rtgp, - xefi->xefi_startblock, xefi->xefi_blockcount); + if (xefi->xefi_flags & XFS_EFI_CANCELLED) + goto done; + + if (*rtgp != to_rtg(xefi->xefi_group)) { + unsigned int lock_flags; + + if (xfs_has_zoned(mp)) + lock_flags = XFS_RTGLOCK_RMAP; + else + lock_flags = XFS_RTGLOCK_BITMAP; + + *rtgp = to_rtg(xefi->xefi_group); + xfs_rtgroup_lock(*rtgp, lock_flags); + xfs_rtgroup_trans_join(tp, *rtgp, lock_flags); } + + if (xfs_has_zoned(mp)) { + error = xfs_zone_free_blocks(tp, *rtgp, xefi->xefi_startblock, + xefi->xefi_blockcount); + } else { + error = xfs_rtfree_blocks(tp, *rtgp, xefi->xefi_startblock, + xefi->xefi_blockcount); + } + if (error == -EAGAIN) { xfs_efd_from_efi(efdp); return error; } - +done: xfs_efd_add_extent(efdp, xefi); xfs_extent_free_cancel_item(item); return error; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 85b857805d6d..84f08c976ac4 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -25,6 +25,8 @@ #include "xfs_iomap.h" #include "xfs_reflink.h" #include "xfs_file.h" +#include "xfs_aops.h" 
+#include "xfs_zone_alloc.h" #include <linux/dax.h> #include <linux/falloc.h> @@ -150,7 +152,7 @@ xfs_file_fsync( * ensure newly written file data make it to disk before logging the new * inode size in case of an extending write. */ - if (XFS_IS_REALTIME_INODE(ip)) + if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp) error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev); else if (mp->m_logdev_targp != mp->m_ddev_targp) error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); @@ -360,7 +362,8 @@ xfs_file_write_zero_eof( struct iov_iter *from, unsigned int *iolock, size_t count, - bool *drained_dio) + bool *drained_dio, + struct xfs_zone_alloc_ctx *ac) { struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); loff_t isize; @@ -414,7 +417,7 @@ xfs_file_write_zero_eof( trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL); + error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL); xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); return error; @@ -431,7 +434,8 @@ STATIC ssize_t xfs_file_write_checks( struct kiocb *iocb, struct iov_iter *from, - unsigned int *iolock) + unsigned int *iolock, + struct xfs_zone_alloc_ctx *ac) { struct inode *inode = iocb->ki_filp->f_mapping->host; size_t count = iov_iter_count(from); @@ -481,7 +485,7 @@ restart: */ if (iocb->ki_pos > i_size_read(inode)) { error = xfs_file_write_zero_eof(iocb, from, iolock, count, - &drained_dio); + &drained_dio, ac); if (error == 1) goto restart; if (error) @@ -491,6 +495,48 @@ restart: return kiocb_modified(iocb); } +static ssize_t +xfs_zoned_write_space_reserve( + struct xfs_inode *ip, + struct kiocb *iocb, + struct iov_iter *from, + unsigned int flags, + struct xfs_zone_alloc_ctx *ac) +{ + loff_t count = iov_iter_count(from); + int error; + + if (iocb->ki_flags & IOCB_NOWAIT) + flags |= XFS_ZR_NOWAIT; + + /* + * Check the rlimit and LFS boundary first so that we don't over-reserve + * by possibly a lot. + * + * The generic write path will redo this check later, and it might have + * changed by then. If it got expanded we'll stick to our earlier + * smaller limit, and if it is decreased the new smaller limit will be + * used and our extra space reservation will be returned after finishing + * the write. + */ + error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count); + if (error) + return error; + + /* + * Sloppily round up count to file system blocks. + * + * This will often reserve an extra block, but that avoids having to look + * at the start offset, which isn't stable for O_APPEND until taking the + * iolock. Also we need to reserve a block each for zeroing the old + * EOF block and the new start block if they are unaligned. + * + * Any remaining block will be returned after the write. 
+ */ + return xfs_zoned_space_reserve(ip, + XFS_B_TO_FSB(ip->i_mount, count) + 1 + 2, flags, ac); +} + static int xfs_dio_write_end_io( struct kiocb *iocb, @@ -503,6 +549,9 @@ xfs_dio_write_end_io( loff_t offset = iocb->ki_pos; unsigned int nofs_flag; + ASSERT(!xfs_is_zoned_inode(ip) || + !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW))); + trace_xfs_end_io_direct_write(ip, offset, size); if (xfs_is_shutdown(ip->i_mount)) @@ -582,14 +631,51 @@ static const struct iomap_dio_ops xfs_dio_write_ops = { .end_io = xfs_dio_write_end_io, }; +static void +xfs_dio_zoned_submit_io( + const struct iomap_iter *iter, + struct bio *bio, + loff_t file_offset) +{ + struct xfs_mount *mp = XFS_I(iter->inode)->i_mount; + struct xfs_zone_alloc_ctx *ac = iter->private; + xfs_filblks_t count_fsb; + struct iomap_ioend *ioend; + + count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size); + if (count_fsb > ac->reserved_blocks) { + xfs_err(mp, +"allocation (%lld) larger than reservation (%lld).", + count_fsb, ac->reserved_blocks); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + bio_io_error(bio); + return; + } + ac->reserved_blocks -= count_fsb; + + bio->bi_end_io = xfs_end_bio; + ioend = iomap_init_ioend(iter->inode, bio, file_offset, + IOMAP_IOEND_DIRECT); + xfs_zone_alloc_and_submit(ioend, &ac->open_zone); +} + +static const struct iomap_dio_ops xfs_dio_zoned_write_ops = { + .bio_set = &iomap_ioend_bioset, + .submit_io = xfs_dio_zoned_submit_io, + .end_io = xfs_dio_write_end_io, +}; + /* - * Handle block aligned direct I/O writes + * Handle block aligned direct I/O writes. */ static noinline ssize_t xfs_file_dio_write_aligned( struct xfs_inode *ip, struct kiocb *iocb, - struct iov_iter *from) + struct iov_iter *from, + const struct iomap_ops *ops, + const struct iomap_dio_ops *dops, + struct xfs_zone_alloc_ctx *ac) { unsigned int iolock = XFS_IOLOCK_SHARED; ssize_t ret; @@ -597,7 +683,7 @@ xfs_file_dio_write_aligned( ret = xfs_ilock_iocb_for_write(iocb, &iolock); if (ret) return ret; - ret = xfs_file_write_checks(iocb, from, &iolock); + ret = xfs_file_write_checks(iocb, from, &iolock, ac); if (ret) goto out_unlock; @@ -611,11 +697,31 @@ xfs_file_dio_write_aligned( iolock = XFS_IOLOCK_SHARED; } trace_xfs_file_direct_write(iocb, from); - ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, - &xfs_dio_write_ops, 0, NULL, 0); + ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0); out_unlock: - if (iolock) - xfs_iunlock(ip, iolock); + xfs_iunlock(ip, iolock); + return ret; +} + +/* + * Handle block aligned direct I/O writes to zoned devices. 
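xfs_dio_zoned_submit_io() treats a bio that exceeds the remaining reservation as a space accounting bug: the real code shuts the file system down and fails the bio. A toy model of that per-bio bookkeeping, with submit_one() standing in for the submission path:

#include <stdbool.h>
#include <stdio.h>

struct alloc_ctx {
    long long reserved_blocks;
};

/*
 * Every submitted bio must fit inside the reservation taken up front;
 * anything larger indicates the accounting went wrong somewhere.
 */
static bool submit_one(struct alloc_ctx *ac, long long count_fsb)
{
    if (count_fsb > ac->reserved_blocks) {
        fprintf(stderr, "allocation (%lld) larger than reservation (%lld)\n",
                count_fsb, ac->reserved_blocks);
        return false;
    }
    ac->reserved_blocks -= count_fsb;
    return true;
}

int main(void)
{
    struct alloc_ctx ac = { .reserved_blocks = 6 };

    submit_one(&ac, 4); /* ok, 2 blocks left */
    submit_one(&ac, 4); /* overrun: rejected */
    return 0;
}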
+ */ +static noinline ssize_t +xfs_file_dio_write_zoned( + struct xfs_inode *ip, + struct kiocb *iocb, + struct iov_iter *from) +{ + struct xfs_zone_alloc_ctx ac = { }; + ssize_t ret; + + ret = xfs_zoned_write_space_reserve(ip, iocb, from, 0, &ac); + if (ret < 0) + return ret; + ret = xfs_file_dio_write_aligned(ip, iocb, from, + &xfs_zoned_direct_write_iomap_ops, + &xfs_dio_zoned_write_ops, &ac); + xfs_zoned_space_unreserve(ip, &ac); return ret; } @@ -675,7 +781,7 @@ retry_exclusive: goto out_unlock; } - ret = xfs_file_write_checks(iocb, from, &iolock); + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); if (ret) goto out_unlock; @@ -721,9 +827,21 @@ xfs_file_dio_write( /* direct I/O must be aligned to device logical sector size */ if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; - if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask) + + /* + * For always COW inodes we also must check the alignment of each + * individual iovec segment, as they could end up with different + * I/Os due to the way bio_iov_iter_get_pages works, and we'd + * then overwrite an already written block. + */ + if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) || + (xfs_is_always_cow_inode(ip) && + (iov_iter_alignment(from) & ip->i_mount->m_blockmask))) return xfs_file_dio_write_unaligned(ip, iocb, from); - return xfs_file_dio_write_aligned(ip, iocb, from); + if (xfs_is_zoned_inode(ip)) + return xfs_file_dio_write_zoned(ip, iocb, from); + return xfs_file_dio_write_aligned(ip, iocb, from, + &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL); } static noinline ssize_t @@ -740,7 +858,7 @@ xfs_file_dax_write( ret = xfs_ilock_iocb(iocb, iolock); if (ret) return ret; - ret = xfs_file_write_checks(iocb, from, &iolock); + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); if (ret) goto out; @@ -784,7 +902,7 @@ write_retry: if (ret) return ret; - ret = xfs_file_write_checks(iocb, from, &iolock); + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); if (ret) goto out; @@ -832,6 +950,67 @@ out: } STATIC ssize_t +xfs_file_buffered_write_zoned( + struct kiocb *iocb, + struct iov_iter *from) +{ + struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); + struct xfs_mount *mp = ip->i_mount; + unsigned int iolock = XFS_IOLOCK_EXCL; + bool cleared_space = false; + struct xfs_zone_alloc_ctx ac = { }; + ssize_t ret; + + ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac); + if (ret < 0) + return ret; + + ret = xfs_ilock_iocb(iocb, iolock); + if (ret) + goto out_unreserve; + + ret = xfs_file_write_checks(iocb, from, &iolock, &ac); + if (ret) + goto out_unlock; + + /* + * Truncate the iter to the length that we were actually able to + * allocate blocks for. This needs to happen after + * xfs_file_write_checks, because that assigns ki_pos for O_APPEND + * writes. + */ + iov_iter_truncate(from, + XFS_FSB_TO_B(mp, ac.reserved_blocks) - + (iocb->ki_pos & mp->m_blockmask)); + if (!iov_iter_count(from)) + goto out_unlock; + +retry: + trace_xfs_file_buffered_write(iocb, from); + ret = iomap_file_buffered_write(iocb, from, + &xfs_buffered_write_iomap_ops, &ac); + if (ret == -ENOSPC && !cleared_space) { + /* + * Kick off writeback to convert delalloc space and release the + * usually too pessimistic indirect block reservations. 
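The retry above deliberately happens at most once: the flush converts delalloc extents and returns their worst-case indirect block reservations, which may free enough space for a second attempt. The same retry-once shape as a runnable sketch, with stub helpers in place of iomap_file_buffered_write() and xfs_flush_inodes():

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Stub: fails with -ENOSPC until a flush has run. */
static bool flushed;

static int try_buffered_write(void)
{
    return flushed ? 0 : -ENOSPC;
}

static void flush_inodes(void)
{
    flushed = true; /* pretend writeback returned indirect reservations */
}

int main(void)
{
    bool cleared_space = false;
    int ret;

retry:
    ret = try_buffered_write();
    if (ret == -ENOSPC && !cleared_space) {
        flush_inodes();
        cleared_space = true;
        goto retry;
    }
    printf("write returned %d\n", ret);
    return 0;
}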
+ */ + xfs_flush_inodes(mp); + cleared_space = true; + goto retry; + } + +out_unlock: + xfs_iunlock(ip, iolock); +out_unreserve: + xfs_zoned_space_unreserve(ip, &ac); + if (ret > 0) { + XFS_STATS_ADD(mp, xs_write_bytes, ret); + ret = generic_write_sync(iocb, ret); + } + return ret; +} + +STATIC ssize_t xfs_file_write_iter( struct kiocb *iocb, struct iov_iter *from) @@ -878,6 +1057,8 @@ xfs_file_write_iter( return ret; } + if (xfs_is_zoned_inode(ip)) + return xfs_file_buffered_write_zoned(iocb, from); return xfs_file_buffered_write(iocb, from); } @@ -932,7 +1113,8 @@ static int xfs_falloc_collapse_range( struct file *file, loff_t offset, - loff_t len) + loff_t len, + struct xfs_zone_alloc_ctx *ac) { struct inode *inode = file_inode(file); loff_t new_size = i_size_read(inode) - len; @@ -948,7 +1130,7 @@ xfs_falloc_collapse_range( if (offset + len >= i_size_read(inode)) return -EINVAL; - error = xfs_collapse_file_space(XFS_I(inode), offset, len); + error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac); if (error) return error; return xfs_falloc_setsize(file, new_size); @@ -1004,7 +1186,8 @@ xfs_falloc_zero_range( struct file *file, int mode, loff_t offset, - loff_t len) + loff_t len, + struct xfs_zone_alloc_ctx *ac) { struct inode *inode = file_inode(file); unsigned int blksize = i_blocksize(inode); @@ -1017,7 +1200,7 @@ xfs_falloc_zero_range( if (error) return error; - error = xfs_free_file_space(XFS_I(inode), offset, len); + error = xfs_free_file_space(XFS_I(inode), offset, len, ac); if (error) return error; @@ -1088,22 +1271,18 @@ xfs_falloc_allocate_range( FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE) STATIC long -xfs_file_fallocate( +__xfs_file_fallocate( struct file *file, int mode, loff_t offset, - loff_t len) + loff_t len, + struct xfs_zone_alloc_ctx *ac) { struct inode *inode = file_inode(file); struct xfs_inode *ip = XFS_I(inode); long error; uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - if (mode & ~XFS_FALLOC_FL_SUPPORTED) - return -EOPNOTSUPP; - xfs_ilock(ip, iolock); error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); if (error) @@ -1124,16 +1303,16 @@ xfs_file_fallocate( switch (mode & FALLOC_FL_MODE_MASK) { case FALLOC_FL_PUNCH_HOLE: - error = xfs_free_file_space(ip, offset, len); + error = xfs_free_file_space(ip, offset, len, ac); break; case FALLOC_FL_COLLAPSE_RANGE: - error = xfs_falloc_collapse_range(file, offset, len); + error = xfs_falloc_collapse_range(file, offset, len, ac); break; case FALLOC_FL_INSERT_RANGE: error = xfs_falloc_insert_range(file, offset, len); break; case FALLOC_FL_ZERO_RANGE: - error = xfs_falloc_zero_range(file, mode, offset, len); + error = xfs_falloc_zero_range(file, mode, offset, len, ac); break; case FALLOC_FL_UNSHARE_RANGE: error = xfs_falloc_unshare_range(file, mode, offset, len); @@ -1154,6 +1333,54 @@ out_unlock: return error; } +static long +xfs_file_zoned_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct xfs_zone_alloc_ctx ac = { }; + struct xfs_inode *ip = XFS_I(file_inode(file)); + int error; + + error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac); + if (error) + return error; + error = __xfs_file_fallocate(file, mode, offset, len, &ac); + xfs_zoned_space_unreserve(ip, &ac); + return error; +} + +static long +xfs_file_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + if (mode & 
~XFS_FALLOC_FL_SUPPORTED) + return -EOPNOTSUPP; + + /* + * For zoned file systems, zeroing the first and last block of a hole + * punch requires allocating a new block to rewrite the remaining data + * and new zeroes out of place. Get reservations for those before + * taking the iolock. Dip into the reserved pool because we are + * expected to be able to punch a hole even on a completely full + * file system. + */ + if (xfs_is_zoned_inode(XFS_I(inode)) && + (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE | + FALLOC_FL_COLLAPSE_RANGE))) + return xfs_file_zoned_fallocate(file, mode, offset, len); + return __xfs_file_fallocate(file, mode, offset, len, NULL); +} + STATIC int xfs_file_fadvise( struct file *file, @@ -1347,15 +1574,22 @@ xfs_file_release( * blocks. This avoids open/read/close workloads from removing EOF * blocks that other writers depend upon to reduce fragmentation. * + * Inodes on the zoned RT device never have preallocations, so skip + * taking the locks below. + */ + if (!inode->i_nlink || + !(file->f_mode & FMODE_WRITE) || + (ip->i_diflags & XFS_DIFLAG_APPEND) || + xfs_is_zoned_inode(ip)) + return 0; + + /* * If we can't get the iolock just skip truncating the blocks past EOF * because we could deadlock with the mmap_lock otherwise. We'll get * another chance to drop them once the last reference to the inode is * dropped, so we'll never leak blocks permanently. */ - if (inode->i_nlink && - (file->f_mode & FMODE_WRITE) && - !(ip->i_diflags & XFS_DIFLAG_APPEND) && - !xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) && + if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) && xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { if (xfs_can_free_eofblocks(ip) && !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED)) @@ -1469,9 +1703,10 @@ xfs_dax_read_fault( * i_lock (XFS - extent map serialisation) */ static vm_fault_t -xfs_write_fault( +__xfs_write_fault( struct vm_fault *vmf, - unsigned int order) + unsigned int order, + struct xfs_zone_alloc_ctx *ac) { struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); @@ -1499,13 +1734,49 @@ xfs_write_fault( ret = xfs_dax_fault_locked(vmf, order, true); else ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops, - NULL); + ac); xfs_iunlock(ip, lock_mode); sb_end_pagefault(inode->i_sb); return ret; } +static vm_fault_t +xfs_write_fault_zoned( + struct vm_fault *vmf, + unsigned int order) +{ + struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file)); + unsigned int len = folio_size(page_folio(vmf->page)); + struct xfs_zone_alloc_ctx ac = { }; + int error; + vm_fault_t ret; + + /* + * This could over-allocate as it doesn't check for truncation. + * + * But as the overallocation is limited to less than a folio and will be + * released instantly, that's just fine. 
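Reserving for the whole faulting folio keeps the fault path simple at the price of a bounded overshoot, which the unreserve call immediately hands back. A worked sketch of the arithmetic, assuming 4k blocks and a hypothetical 64k folio:

#include <stdio.h>

#define BLOCKSIZE 4096ULL

/* Round a byte count up to blocks (models XFS_B_TO_FSB). */
static unsigned long long b_to_fsb(unsigned long long bytes)
{
    return (bytes + BLOCKSIZE - 1) / BLOCKSIZE;
}

int main(void)
{
    unsigned long long folio_bytes = 65536; /* hypothetical 64k folio */

    /*
     * The fault reserves for the whole folio even if truncation later
     * shrinks what actually gets dirtied; the excess is returned by
     * the unreserve call right after the fault completes.
     */
    printf("reserve %llu blocks for a %llu byte folio\n",
           b_to_fsb(folio_bytes), folio_bytes);
    return 0;
}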
+ */ + error = xfs_zoned_space_reserve(ip, XFS_B_TO_FSB(ip->i_mount, len), 0, + &ac); + if (error < 0) + return vmf_fs_error(error); + ret = __xfs_write_fault(vmf, order, &ac); + xfs_zoned_space_unreserve(ip, &ac); + return ret; +} + +static vm_fault_t +xfs_write_fault( + struct vm_fault *vmf, + unsigned int order) +{ + if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file)))) + return xfs_write_fault_zoned(vmf, order); + return __xfs_write_fault(vmf, order, NULL); +} + static inline bool xfs_is_write_fault( struct vm_fault *vmf) diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 1dbd2d75f7ae..a4bc1642fe56 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -879,17 +879,39 @@ xfs_getfsmap_rtdev_rmapbt( struct xfs_mount *mp = tp->t_mountp; struct xfs_rtgroup *rtg = NULL; struct xfs_btree_cur *bt_cur = NULL; + xfs_daddr_t rtstart_daddr; xfs_rtblock_t start_rtb; xfs_rtblock_t end_rtb; xfs_rgnumber_t start_rg, end_rg; uint64_t eofs; int error = 0; - eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); + eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart + mp->m_sb.sb_rblocks); if (keys[0].fmr_physical >= eofs) return 0; - start_rtb = xfs_daddr_to_rtb(mp, keys[0].fmr_physical); - end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical)); + + rtstart_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart); + if (keys[0].fmr_physical < rtstart_daddr) { + struct xfs_fsmap_irec frec = { + .owner = XFS_RMAP_OWN_FS, + .len_daddr = rtstart_daddr, + }; + + /* Adjust the low key if we are continuing from where we left off. */ + if (keys[0].fmr_length > 0) { + info->low_daddr = keys[0].fmr_physical + keys[0].fmr_length; + return 0; + } + + /* Fabricate an rmap entry for space occupied by the data dev */ + error = xfs_getfsmap_helper(tp, info, &frec); + if (error) + return error; + } + + start_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + keys[0].fmr_physical); + end_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + + min(eofs - 1, keys[1].fmr_physical)); info->missing_owner = XFS_FMR_OWN_FREE; @@ -1004,22 +1026,40 @@ xfs_getfsmap_rtdev_rmapbt( } #endif /* CONFIG_XFS_RT */ +static uint32_t +xfs_getfsmap_device( + struct xfs_mount *mp, + enum xfs_device dev) +{ + if (mp->m_sb.sb_rtstart) + return dev; + + switch (dev) { + case XFS_DEV_DATA: + return new_encode_dev(mp->m_ddev_targp->bt_dev); + case XFS_DEV_LOG: + return new_encode_dev(mp->m_logdev_targp->bt_dev); + case XFS_DEV_RT: + if (!mp->m_rtdev_targp) + break; + return new_encode_dev(mp->m_rtdev_targp->bt_dev); + } + + return -1; +} + /* Do we recognize the device? */ STATIC bool xfs_getfsmap_is_valid_device( struct xfs_mount *mp, struct xfs_fsmap *fm) { - if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX || - fm->fmr_device == new_encode_dev(mp->m_ddev_targp->bt_dev)) - return true; - if (mp->m_logdev_targp && - fm->fmr_device == new_encode_dev(mp->m_logdev_targp->bt_dev)) - return true; - if (mp->m_rtdev_targp && - fm->fmr_device == new_encode_dev(mp->m_rtdev_targp->bt_dev)) - return true; - return false; + return fm->fmr_device == 0 || + fm->fmr_device == UINT_MAX || + fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_DATA) || + fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_LOG) || + (mp->m_rtdev_targp && + fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_RT)); } /* Ensure that the low key is less than the high key. */ @@ -1126,7 +1166,7 @@ xfs_getfsmap( /* Set up our device handlers. 
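The effect of xfs_getfsmap_device() is that a file system with an internal RT section (sb_rtstart != 0) reports stable synthetic device cookies, because all sections share one physical bdev; otherwise the encoded dev_t is reported as before. A small model with invented enum values and an arbitrary encoded dev_t:

#include <stdio.h>

enum fake_device { DEV_DATA = 1, DEV_LOG = 2, DEV_RT = 3 };

/*
 * With an internal RT section there is only one physical bdev, so the
 * enum value doubles as a synthetic per-section cookie; otherwise the
 * (here arbitrary) encoded dev_t is returned as before.
 */
static unsigned int fsmap_device(unsigned long long rtstart,
                                 unsigned int encoded_dev,
                                 enum fake_device dev)
{
    if (rtstart)
        return dev;
    return encoded_dev;
}

int main(void)
{
    printf("external rt: %u\n", fsmap_device(0, 0x800010, DEV_RT));
    printf("internal rt: %u\n", fsmap_device(1024, 0x800010, DEV_RT));
    return 0;
}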
*/ memset(handlers, 0, sizeof(handlers)); handlers[0].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); - handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev); + handlers[0].dev = xfs_getfsmap_device(mp, XFS_DEV_DATA); if (use_rmap) handlers[0].fn = xfs_getfsmap_datadev_rmapbt; else @@ -1134,13 +1174,17 @@ xfs_getfsmap( if (mp->m_logdev_targp != mp->m_ddev_targp) { handlers[1].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); - handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev); + handlers[1].dev = xfs_getfsmap_device(mp, XFS_DEV_LOG); handlers[1].fn = xfs_getfsmap_logdev; } #ifdef CONFIG_XFS_RT - if (mp->m_rtdev_targp) { + /* + * For zoned file systems there is no rtbitmap, so only support fsmap + * if the caller is privileged enough to use the full rmap version. + */ + if (mp->m_rtdev_targp && (use_rmap || !xfs_has_zoned(mp))) { handlers[2].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); - handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev); + handlers[2].dev = xfs_getfsmap_device(mp, XFS_DEV_RT); if (use_rmap) handlers[2].fn = xfs_getfsmap_rtdev_rmapbt; else @@ -1230,7 +1274,13 @@ xfs_getfsmap( if (tp) xfs_trans_cancel(tp); - head->fmh_oflags = FMH_OF_DEV_T; + + /* + * For an internal RT device we need to report different synthetic devices + * for a single physical device, and thus can't report the actual dev_t. + */ + if (!mp->m_sb.sb_rtstart) + head->fmh_oflags = FMH_OF_DEV_T; return error; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 455298503d01..0ada73569394 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -24,6 +24,7 @@ #include "xfs_rtalloc.h" #include "xfs_rtrmap_btree.h" #include "xfs_rtrefcount_btree.h" +#include "xfs_metafile.h" /* * Write new AG headers to disk. Non-transactional, but need to be @@ -110,7 +111,7 @@ xfs_growfs_data_private( if (nb > mp->m_sb.sb_dblocks) { error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); + XFS_FSS_TO_BB(mp, 1), &bp, NULL); if (error) return error; xfs_buf_relse(bp); @@ -300,24 +301,30 @@ xfs_growfs_data( struct xfs_mount *mp, struct xfs_growfs_data *in) { - int error = 0; + int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!mutex_trylock(&mp->m_growlock)) return -EWOULDBLOCK; + /* we can't grow the data section when an internal RT section exists */ + if (in->newblocks != mp->m_sb.sb_dblocks && mp->m_sb.sb_rtstart) { + error = -EINVAL; + goto out_unlock; + } + /* update imaxpct separately to the physical grow of the filesystem */ if (in->imaxpct != mp->m_sb.sb_imax_pct) { error = xfs_growfs_imaxpct(mp, in->imaxpct); if (error) - goto out_error; + goto out_unlock; } if (in->newblocks != mp->m_sb.sb_dblocks) { error = xfs_growfs_data_private(mp, in); if (error) - goto out_error; + goto out_unlock; } /* Post growfs calculations needed to reflect new state in operations */ @@ -331,13 +338,12 @@ xfs_growfs_data( /* Update secondary superblocks now the physical grow has completed */ error = xfs_update_secondary_sbs(mp); -out_error: /* - * Increment the generation unconditionally, the error could be from - * updating the secondary superblocks, in which case the new size - * is live already. + * Increment the generation unconditionally, after trying to update the + * secondary superblocks, as the new size is live already at this point. 
*/ mp->m_generation++; +out_unlock: mutex_unlock(&mp->m_growlock); return error; } @@ -366,6 +372,7 @@ xfs_growfs_log( int xfs_reserve_blocks( struct xfs_mount *mp, + enum xfs_free_counter ctr, uint64_t request) { int64_t lcounter, delta; @@ -373,6 +380,8 @@ xfs_reserve_blocks( int64_t free; int error = 0; + ASSERT(ctr < XC_FREE_NR); + /* * With per-cpu counters, this becomes an interesting problem. we need * to work out if we are freeing or allocation blocks first, then we can @@ -391,16 +400,16 @@ xfs_reserve_blocks( * counters directly since we shouldn't have any problems unreserving * space. */ - if (mp->m_resblks > request) { - lcounter = mp->m_resblks_avail - request; + if (mp->m_free[ctr].res_total > request) { + lcounter = mp->m_free[ctr].res_avail - request; if (lcounter > 0) { /* release unused blocks */ fdblks_delta = lcounter; - mp->m_resblks_avail -= lcounter; + mp->m_free[ctr].res_avail -= lcounter; } - mp->m_resblks = request; + mp->m_free[ctr].res_total = request; if (fdblks_delta) { spin_unlock(&mp->m_sb_lock); - xfs_add_fdblocks(mp, fdblks_delta); + xfs_add_freecounter(mp, ctr, fdblks_delta); spin_lock(&mp->m_sb_lock); } @@ -409,7 +418,7 @@ xfs_reserve_blocks( /* * If the request is larger than the current reservation, reserve the - * blocks before we update the reserve counters. Sample m_fdblocks and + * blocks before we update the reserve counters. Sample m_free and * perform a partial reservation if the request exceeds free space. * * The code below estimates how many blocks it can request from @@ -419,10 +428,10 @@ xfs_reserve_blocks( * space to fill it because mod_fdblocks will refill an undersized * reserve when it can. */ - free = percpu_counter_sum(&mp->m_fdblocks) - - xfs_fdblocks_unavailable(mp); - delta = request - mp->m_resblks; - mp->m_resblks = request; + free = xfs_sum_freecounter_raw(mp, ctr) - + xfs_freecounter_unavailable(mp, ctr); + delta = request - mp->m_free[ctr].res_total; + mp->m_free[ctr].res_total = request; if (delta > 0 && free > 0) { /* * We'll either succeed in getting space from the free block @@ -436,9 +445,9 @@ xfs_reserve_blocks( */ fdblks_delta = min(free, delta); spin_unlock(&mp->m_sb_lock); - error = xfs_dec_fdblocks(mp, fdblks_delta, 0); + error = xfs_dec_freecounter(mp, ctr, fdblks_delta, 0); if (!error) - xfs_add_fdblocks(mp, fdblks_delta); + xfs_add_freecounter(mp, ctr, fdblks_delta); spin_lock(&mp->m_sb_lock); } out: @@ -558,15 +567,13 @@ xfs_fs_reserve_ag_blocks( return error; } - if (xfs_has_realtime(mp)) { - err2 = xfs_rt_resv_init(mp); - if (err2 && err2 != -ENOSPC) { - xfs_warn(mp, - "Error %d reserving realtime metadata reserve pool.", err2); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - } + err2 = xfs_metafile_resv_init(mp); + if (err2 && err2 != -ENOSPC) { + xfs_warn(mp, + "Error %d reserving realtime metadata reserve pool.", err2); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - if (err2 && !error) + if (!error) error = err2; } @@ -582,9 +589,7 @@ xfs_fs_unreserve_ag_blocks( { struct xfs_perag *pag = NULL; - if (xfs_has_realtime(mp)) - xfs_rt_resv_free(mp); - + xfs_metafile_resv_free(mp); while ((pag = xfs_perag_next(mp, pag))) xfs_ag_resv_free(pag); } diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 3e2f73bcf831..9d23c361ef56 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -8,7 +8,8 @@ int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in); int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in); -int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request); 
+int xfs_reserve_blocks(struct xfs_mount *mp, enum xfs_free_counter cnt, + uint64_t request); int xfs_fs_goingdown(struct xfs_mount *mp, uint32_t inflags); int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 7b6c026d01a1..2f53ca7e12d4 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -2073,10 +2073,10 @@ xfs_inodegc_want_queue_rt_file( { struct xfs_mount *mp = ip->i_mount; - if (!XFS_IS_REALTIME_INODE(ip)) + if (!XFS_IS_REALTIME_INODE(ip) || xfs_has_zoned(mp)) return false; - if (__percpu_counter_compare(&mp->m_frextents, + if (xfs_compare_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_low_rtexts[XFS_LOWSP_5_PCNT], XFS_FDBLOCKS_BATCH) < 0) return true; @@ -2104,7 +2104,7 @@ xfs_inodegc_want_queue_work( if (items > mp->m_ino_geo.inodes_per_cluster) return true; - if (__percpu_counter_compare(&mp->m_fdblocks, + if (xfs_compare_freecounter(mp, XC_FREE_BLOCKS, mp->m_low_space[XFS_LOWSP_5_PCNT], XFS_FDBLOCKS_BATCH) < 0) return true; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b1f9f156ec88..ce6b8ffbaa2c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1721,8 +1721,7 @@ xfs_ifree_cluster( * to mark all the active inodes on the buffer stale. */ error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, - mp->m_bsize * igeo->blocks_per_cluster, - XBF_UNMAPPED, &bp); + mp->m_bsize * igeo->blocks_per_cluster, 0, &bp); if (error) return error; @@ -3074,5 +3073,6 @@ bool xfs_is_always_cow_inode( const struct xfs_inode *ip) { - return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount); + return xfs_is_zoned_inode(ip) || + (ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount)); } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index c08093a65352..4bb7a99e0dc4 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -25,19 +25,9 @@ struct xfs_dquot; typedef struct xfs_inode { /* Inode linking and identification information. */ struct xfs_mount *i_mount; /* fs mount struct ptr */ - union { - struct { - struct xfs_dquot *i_udquot; /* user dquot */ - struct xfs_dquot *i_gdquot; /* group dquot */ - struct xfs_dquot *i_pdquot; /* project dquot */ - }; - - /* - * Space that has been set aside to accomodate expansions of a - * metadata btree rooted in this file. 
- */ - uint64_t i_meta_resv_asked; - }; + struct xfs_dquot *i_udquot; /* user dquot */ + struct xfs_dquot *i_gdquot; /* group dquot */ + struct xfs_dquot *i_pdquot; /* project dquot */ /* Inode location stuff */ xfs_ino_t i_ino; /* inode number (agno/agino)*/ @@ -69,8 +59,13 @@ typedef struct xfs_inode { xfs_rfsblock_t i_nblocks; /* # of direct & btree blocks */ prid_t i_projid; /* owner's project id */ xfs_extlen_t i_extsize; /* basic/minimum extent size */ - /* cowextsize is only used for v3 inodes, flushiter for v1/2 */ + /* + * i_used_blocks is used for zoned rtrmap inodes, + * i_cowextsize is used for other v3 inodes, + * i_flushiter for v1/2 inodes + */ union { + uint32_t i_used_blocks; /* used blocks in RTG */ xfs_extlen_t i_cowextsize; /* basic cow extent size */ uint16_t i_flushiter; /* incremented on flush */ }; @@ -309,6 +304,11 @@ static inline bool xfs_is_internal_inode(const struct xfs_inode *ip) xfs_is_quota_inode(&mp->m_sb, ip->i_ino); } +static inline bool xfs_is_zoned_inode(const struct xfs_inode *ip) +{ + return xfs_has_zoned(ip->i_mount) && XFS_IS_REALTIME_INODE(ip); +} + bool xfs_is_always_cow_inode(const struct xfs_inode *ip); static inline bool xfs_is_cow_inode(const struct xfs_inode *ip) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 35803fcf0beb..40fc1bf900af 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -596,6 +596,7 @@ xfs_inode_to_log_dinode( to->di_changecount = inode_peek_iversion(inode); to->di_crtime = xfs_inode_to_log_dinode_ts(ip, ip->i_crtime); to->di_flags2 = ip->i_diflags2; + /* also covers the di_used_blocks union arm: */ to->di_cowextsize = ip->i_cowextsize; to->di_ino = ip->i_ino; to->di_lsn = lsn; diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index f3bfb814378c..7205fd14f6b3 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -203,6 +203,7 @@ xfs_log_dinode_to_disk( to->di_crtime = xfs_log_dinode_to_disk_ts(from, from->di_crtime); to->di_flags2 = cpu_to_be64(from->di_flags2); + /* also covers the di_used_blocks union arm: */ to->di_cowextsize = cpu_to_be32(from->di_cowextsize); to->di_ino = cpu_to_be64(from->di_ino); to->di_lsn = cpu_to_be64(lsn); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index ed85322507dd..d250f7f74e3b 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1131,15 +1131,15 @@ xfs_ioctl_getset_resblocks( error = mnt_want_write_file(filp); if (error) return error; - error = xfs_reserve_blocks(mp, fsop.resblks); + error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, fsop.resblks); mnt_drop_write_file(filp); if (error) return error; } spin_lock(&mp->m_sb_lock); - fsop.resblks = mp->m_resblks; - fsop.resblks_avail = mp->m_resblks_avail; + fsop.resblks = mp->m_free[XC_FREE_BLOCKS].res_total; + fsop.resblks_avail = mp->m_free[XC_FREE_BLOCKS].res_avail; spin_unlock(&mp->m_sb_lock); if (copy_to_user(arg, &fsop, sizeof(fsop))) @@ -1155,9 +1155,9 @@ xfs_ioctl_fs_counts( struct xfs_fsop_counts out = { .allocino = percpu_counter_read_positive(&mp->m_icount), .freeino = percpu_counter_read_positive(&mp->m_ifree), - .freedata = percpu_counter_read_positive(&mp->m_fdblocks) - - xfs_fdblocks_unavailable(mp), - .freertx = percpu_counter_read_positive(&mp->m_frextents), + .freedata = xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) - + xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS), + .freertx = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS), }; if (copy_to_user(uarg, &out, sizeof(out))) diff --git a/fs/xfs/xfs_iomap.c 
b/fs/xfs/xfs_iomap.c index f631177ac320..cb23c8871f81 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -30,6 +30,8 @@ #include "xfs_reflink.h" #include "xfs_health.h" #include "xfs_rtbitmap.h" +#include "xfs_icache.h" +#include "xfs_zone_alloc.h" #define XFS_ALLOC_ALIGN(mp, off) \ (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log) @@ -431,13 +433,14 @@ xfs_quota_calc_throttle( static int64_t xfs_iomap_freesp( - struct percpu_counter *counter, + struct xfs_mount *mp, + unsigned int idx, uint64_t low_space[XFS_LOWSP_MAX], int *shift) { int64_t freesp; - freesp = percpu_counter_read_positive(counter); + freesp = xfs_estimate_freecounter(mp, idx); if (freesp < low_space[XFS_LOWSP_5_PCNT]) { *shift = 2; if (freesp < low_space[XFS_LOWSP_4_PCNT]) @@ -536,10 +539,10 @@ xfs_iomap_prealloc_size( if (unlikely(XFS_IS_REALTIME_INODE(ip))) freesp = xfs_rtbxlen_to_blen(mp, - xfs_iomap_freesp(&mp->m_frextents, + xfs_iomap_freesp(mp, XC_FREE_RTEXTENTS, mp->m_low_rtexts, &shift)); else - freesp = xfs_iomap_freesp(&mp->m_fdblocks, mp->m_low_space, + freesp = xfs_iomap_freesp(mp, XC_FREE_BLOCKS, mp->m_low_space, &shift); /* @@ -966,6 +969,59 @@ const struct iomap_ops xfs_direct_write_iomap_ops = { .iomap_begin = xfs_direct_write_iomap_begin, }; +#ifdef CONFIG_XFS_RT +/* + * This is really simple. The space has already been reserved before taking the + * IOLOCK; the actual block allocation is done just before submitting the bio + * and only recorded in the extent map on I/O completion. + */ +static int +xfs_zoned_direct_write_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t length, + unsigned flags, + struct iomap *iomap, + struct iomap *srcmap) +{ + struct xfs_inode *ip = XFS_I(inode); + int error; + + ASSERT(!(flags & IOMAP_OVERWRITE_ONLY)); + + /* + * Needs to be pushed down into the allocator so that only writes into + * a single zone can be supported. + */ + if (flags & IOMAP_NOWAIT) + return -EAGAIN; + + /* + * Ensure the extent list is in memory so that we don't have to + * read it from the I/O completion handler. + */ + if (xfs_need_iread_extents(&ip->i_df)) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + return error; + } + + iomap->type = IOMAP_MAPPED; + iomap->flags = IOMAP_F_DIRTY | IOMAP_F_ANON_WRITE; + iomap->bdev = ip->i_mount->m_rtdev_targp->bt_bdev; + iomap->offset = offset; + iomap->length = length; + return 0; +} + +const struct iomap_ops xfs_zoned_direct_write_iomap_ops = { + .iomap_begin = xfs_zoned_direct_write_iomap_begin, +}; +#endif /* CONFIG_XFS_RT */ + static int xfs_dax_write_iomap_end( struct inode *inode, @@ -991,6 +1047,455 @@ const struct iomap_ops xfs_dax_write_iomap_ops = { .iomap_end = xfs_dax_write_iomap_end, }; +/* + * Convert a hole to a delayed allocation. 
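The IOMAP_F_ANON_WRITE flag set above is the crux: the mapping carries no block number, and the zone allocator binds one from a zone's write pointer only at bio submission. A toy model of that late binding; zone and anon_write are invented types, not kernel structures:

#include <stdio.h>

/*
 * Zones are written sequentially: the write pointer of the chosen zone
 * becomes the block number, assigned only when the bio is submitted.
 */
struct zone {
    unsigned long long write_pointer;
};

struct anon_write {
    unsigned long long file_offset; /* known at iomap_begin time */
    unsigned long long nblocks;
    unsigned long long startblock;  /* assigned at submit time */
};

static void submit(struct zone *z, struct anon_write *w)
{
    w->startblock = z->write_pointer; /* late allocation */
    z->write_pointer += w->nblocks;
}

int main(void)
{
    struct zone z = { .write_pointer = 1000 };
    struct anon_write a = { .file_offset = 0, .nblocks = 8 };
    struct anon_write b = { .file_offset = 8, .nblocks = 4 };

    submit(&z, &a);
    submit(&z, &b);
    printf("a -> block %llu, b -> block %llu\n", a.startblock, b.startblock);
    return 0;
}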
+ */ +static void +xfs_bmap_add_extent_hole_delay( + struct xfs_inode *ip, /* incore inode pointer */ + int whichfork, + struct xfs_iext_cursor *icur, + struct xfs_bmbt_irec *new) /* new data to add to file extents */ +{ + struct xfs_ifork *ifp; /* inode fork pointer */ + xfs_bmbt_irec_t left; /* left neighbor extent entry */ + xfs_filblks_t newlen=0; /* new indirect size */ + xfs_filblks_t oldlen=0; /* old indirect size */ + xfs_bmbt_irec_t right; /* right neighbor extent entry */ + uint32_t state = xfs_bmap_fork_to_state(whichfork); + xfs_filblks_t temp; /* temp for indirect calculations */ + + ifp = xfs_ifork_ptr(ip, whichfork); + ASSERT(isnullstartblock(new->br_startblock)); + + /* + * Check and set flags if this segment has a left neighbor + */ + if (xfs_iext_peek_prev_extent(ifp, icur, &left)) { + state |= BMAP_LEFT_VALID; + if (isnullstartblock(left.br_startblock)) + state |= BMAP_LEFT_DELAY; + } + + /* + * Check and set flags if the current (right) segment exists. + * If it doesn't exist, we're converting the hole at end-of-file. + */ + if (xfs_iext_get_extent(ifp, icur, &right)) { + state |= BMAP_RIGHT_VALID; + if (isnullstartblock(right.br_startblock)) + state |= BMAP_RIGHT_DELAY; + } + + /* + * Set contiguity flags on the left and right neighbors. + * Don't let extents get too large, even if the pieces are contiguous. + */ + if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && + left.br_startoff + left.br_blockcount == new->br_startoff && + left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) + state |= BMAP_LEFT_CONTIG; + + if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && + new->br_startoff + new->br_blockcount == right.br_startoff && + new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && + (!(state & BMAP_LEFT_CONTIG) || + (left.br_blockcount + new->br_blockcount + + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))) + state |= BMAP_RIGHT_CONTIG; + + /* + * Switch out based on the contiguity flags. + */ + switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + /* + * New allocation is contiguous with delayed allocations + * on the left and on the right. + * Merge all three into a single extent record. + */ + temp = left.br_blockcount + new->br_blockcount + + right.br_blockcount; + + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock) + + startblockval(right.br_startblock); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); + left.br_startblock = nullstartblock(newlen); + left.br_blockcount = temp; + + xfs_iext_remove(ip, icur, state); + xfs_iext_prev(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &left); + break; + + case BMAP_LEFT_CONTIG: + /* + * New allocation is contiguous with a delayed allocation + * on the left. + * Merge the new allocation with the left neighbor. + */ + temp = left.br_blockcount + new->br_blockcount; + + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); + left.br_blockcount = temp; + left.br_startblock = nullstartblock(newlen); + + xfs_iext_prev(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &left); + break; + + case BMAP_RIGHT_CONTIG: + /* + * New allocation is contiguous with a delayed allocation + * on the right. + * Merge the new allocation with the right neighbor. 
+ */ + temp = new->br_blockcount + right.br_blockcount; + oldlen = startblockval(new->br_startblock) + + startblockval(right.br_startblock); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); + right.br_startoff = new->br_startoff; + right.br_startblock = nullstartblock(newlen); + right.br_blockcount = temp; + xfs_iext_update_extent(ip, state, icur, &right); + break; + + case 0: + /* + * New allocation is not contiguous with another + * delayed allocation. + * Insert a new entry. + */ + oldlen = newlen = 0; + xfs_iext_insert(ip, icur, new, state); + break; + } + if (oldlen != newlen) { + ASSERT(oldlen > newlen); + xfs_add_fdblocks(ip->i_mount, oldlen - newlen); + + /* + * Nothing to do for disk quota accounting here. + */ + xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen); + } +} + +/* + * Add a delayed allocation extent to an inode. Blocks are reserved from the + * global pool and the extent inserted into the inode in-core extent tree. + * + * On entry, got refers to the first extent beyond the offset of the extent to + * allocate or eof is specified if no such extent exists. On return, got refers + * to the extent record that was inserted to the inode fork. + * + * Note that the allocated extent may have been merged with contiguous extents + * during insertion into the inode fork. Thus, got does not reflect the current + * state of the inode fork on return. If necessary, the caller can use lastx to + * look up the updated record in the inode fork. + */ +static int +xfs_bmapi_reserve_delalloc( + struct xfs_inode *ip, + int whichfork, + xfs_fileoff_t off, + xfs_filblks_t len, + xfs_filblks_t prealloc, + struct xfs_bmbt_irec *got, + struct xfs_iext_cursor *icur, + int eof) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + xfs_extlen_t alen; + xfs_extlen_t indlen; + uint64_t fdblocks; + int error; + xfs_fileoff_t aoff; + bool use_cowextszhint = + whichfork == XFS_COW_FORK && !prealloc; + +retry: + /* + * Cap the alloc length. Keep track of prealloc so we know whether to + * tag the inode before we return. + */ + aoff = off; + alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN); + if (!eof) + alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); + if (prealloc && alen >= len) + prealloc = alen - len; + + /* + * If we're targeting the COW fork but aren't creating a speculative + * posteof preallocation, try to expand the reservation to align with + * the COW extent size hint if there's sufficient free space. + * + * Unlike the data fork, the CoW cancellation functions will free all + * the reservations at inactivation, so we don't require that every + * delalloc reservation have a dirty pagecache. + */ + if (use_cowextszhint) { + struct xfs_bmbt_irec prev; + xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip); + + if (!xfs_iext_peek_prev_extent(ifp, icur, &prev)) + prev.br_startoff = NULLFILEOFF; + + error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof, + 1, 0, &aoff, &alen); + ASSERT(!error); + } + + /* + * Make a transaction-less quota reservation for delayed allocation + * blocks. This number gets adjusted later. We return if we haven't + * allocated blocks already inside this loop. + */ + error = xfs_quota_reserve_blkres(ip, alen); + if (error) + goto out; + + /* + * Split changing sb for alen and indlen since they could be coming + * from different places. 
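The contiguity cases above all reduce to gluing the new delalloc range onto whichever neighbors touch it, subject to a maximum extent length. A runnable sketch of the both-neighbors case, with made-up extent values and maxlen standing in for XFS_MAX_BMBT_EXTLEN; the kernel version additionally recomputes the worst-case indirect reservation for the merged extent, which this sketch omits:

#include <stdio.h>

struct extent {
    unsigned long long startoff;
    unsigned long long count;
};

/* Can 'l' be glued to 'r' without exceeding the maximum length? */
static int mergeable(const struct extent *l, const struct extent *r,
                     unsigned long long maxlen)
{
    return l->startoff + l->count == r->startoff &&
           l->count + r->count <= maxlen;
}

int main(void)
{
    struct extent left = { 0, 16 }, new = { 16, 8 }, right = { 24, 8 };
    unsigned long long maxlen = 1u << 21; /* stands in for XFS_MAX_BMBT_EXTLEN */

    /* Both neighbors touch 'new', so all three collapse into one record. */
    if (mergeable(&left, &new, maxlen) && mergeable(&new, &right, maxlen)) {
        left.count += new.count + right.count;
        printf("merged: [%llu, +%llu)\n", left.startoff, left.count);
    }
    return 0;
}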
+ */ + indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen); + ASSERT(indlen > 0); + + fdblocks = indlen; + if (XFS_IS_REALTIME_INODE(ip)) { + ASSERT(!xfs_is_zoned_inode(ip)); + error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen)); + if (error) + goto out_unreserve_quota; + } else { + fdblocks += alen; + } + + error = xfs_dec_fdblocks(mp, fdblocks, false); + if (error) + goto out_unreserve_frextents; + + ip->i_delayed_blks += alen; + xfs_mod_delalloc(ip, alen, indlen); + + got->br_startoff = aoff; + got->br_startblock = nullstartblock(indlen); + got->br_blockcount = alen; + got->br_state = XFS_EXT_NORM; + + xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got); + + /* + * Tag the inode if blocks were preallocated. Note that COW fork + * preallocation can occur at the start or end of the extent, even when + * prealloc == 0, so we must also check the aligned offset and length. + */ + if (whichfork == XFS_DATA_FORK && prealloc) + xfs_inode_set_eofblocks_tag(ip); + if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) + xfs_inode_set_cowblocks_tag(ip); + + return 0; + +out_unreserve_frextents: + if (XFS_IS_REALTIME_INODE(ip)) + xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen)); +out_unreserve_quota: + if (XFS_IS_QUOTA_ON(mp)) + xfs_quota_unreserve_blkres(ip, alen); +out: + if (error == -ENOSPC || error == -EDQUOT) { + trace_xfs_delalloc_enospc(ip, off, len); + + if (prealloc || use_cowextszhint) { + /* retry without any preallocation */ + use_cowextszhint = false; + prealloc = 0; + goto retry; + } + } + return error; +} + +static int +xfs_zoned_buffered_write_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t count, + unsigned flags, + struct iomap *iomap, + struct iomap *srcmap) +{ + struct iomap_iter *iter = + container_of(iomap, struct iomap_iter, iomap); + struct xfs_zone_alloc_ctx *ac = iter->private; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count); + u16 iomap_flags = IOMAP_F_SHARED; + unsigned int lockmode = XFS_ILOCK_EXCL; + xfs_filblks_t count_fsb; + xfs_extlen_t indlen; + struct xfs_bmbt_irec got; + struct xfs_iext_cursor icur; + int error = 0; + + ASSERT(!xfs_get_extsz_hint(ip)); + ASSERT(!(flags & IOMAP_UNSHARE)); + ASSERT(ac); + + if (xfs_is_shutdown(mp)) + return -EIO; + + error = xfs_qm_dqattach(ip); + if (error) + return error; + + error = xfs_ilock_for_iomap(ip, flags, &lockmode); + if (error) + return error; + + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + xfs_bmap_mark_sick(ip, XFS_DATA_FORK); + error = -EFSCORRUPTED; + goto out_unlock; + } + + XFS_STATS_INC(mp, xs_blk_mapw); + + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + goto out_unlock; + + /* + * For zeroing operations check if there is any data to zero first. + * + * For regular writes we always need to allocate new blocks, but need to + * provide the source mapping when the range is unaligned to support + * read-modify-write of the whole block in the page cache. + * + * In either case we need to limit the reported range to the boundaries + * of the source map in the data fork. 
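The split above charges two different pools: the data blocks (alen) come from fdblocks, or from the RT extent counter for realtime inodes, while the worst-case indirect btree blocks (indlen) always come from fdblocks. A worked sketch with invented numbers:

#include <stdio.h>

int main(void)
{
    unsigned long long alen = 256; /* data blocks wanted (invented) */
    unsigned long long indlen = 9; /* worst-case btree blocks (invented) */
    int realtime = 0;

    /* Realtime data comes from the RT extent pool; indlen never does. */
    unsigned long long from_fdblocks = indlen + (realtime ? 0 : alen);
    unsigned long long from_rtextents = realtime ? alen : 0;

    printf("fdblocks -= %llu, rtextents -= %llu\n",
           from_fdblocks, from_rtextents);
    return 0;
}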
+ */ + if (!IS_ALIGNED(offset, mp->m_sb.sb_blocksize) || + !IS_ALIGNED(offset + count, mp->m_sb.sb_blocksize) || + (flags & IOMAP_ZERO)) { + struct xfs_bmbt_irec smap; + struct xfs_iext_cursor scur; + + if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &scur, + &smap)) + smap.br_startoff = end_fsb; /* fake hole until EOF */ + if (smap.br_startoff > offset_fsb) { + /* + * We never need to allocate blocks for zeroing a hole. + */ + if (flags & IOMAP_ZERO) { + xfs_hole_to_iomap(ip, iomap, offset_fsb, + smap.br_startoff); + goto out_unlock; + } + end_fsb = min(end_fsb, smap.br_startoff); + } else { + end_fsb = min(end_fsb, + smap.br_startoff + smap.br_blockcount); + xfs_trim_extent(&smap, offset_fsb, + end_fsb - offset_fsb); + error = xfs_bmbt_to_iomap(ip, srcmap, &smap, flags, 0, + xfs_iomap_inode_sequence(ip, 0)); + if (error) + goto out_unlock; + } + } + + if (!ip->i_cowfp) + xfs_ifork_init_cow(ip); + + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got)) + got.br_startoff = end_fsb; + if (got.br_startoff <= offset_fsb) { + trace_xfs_reflink_cow_found(ip, &got); + goto done; + } + + /* + * Cap the maximum length to keep the chunks of work done here somewhat + * symmetric with the work writeback does. + */ + end_fsb = min(end_fsb, got.br_startoff); + count_fsb = min3(end_fsb - offset_fsb, XFS_MAX_BMBT_EXTLEN, + XFS_B_TO_FSB(mp, 1024 * PAGE_SIZE)); + + /* + * The block reservation is supposed to cover all blocks that the + * operation could possibly write, but there is a nasty corner case + * where blocks could be stolen from underneath us: + * + * 1) while this thread iterates over a larger buffered write, + * 2) another thread is causing a write fault that calls into + * ->page_mkwrite in the range this thread writes to, using up the + * delalloc reservation created by a previous call to this function. + * 3) another thread does direct I/O on the range that the write fault + * happened on, which causes writeback of the dirty data. + * 4) this then sets the stale flag, which cuts the current iomap + * iteration short, causing the new call to ->iomap_begin that gets + * us here again, but now without a sufficient reservation. + * + * This is a very unusual I/O pattern, and nothing but generic/095 is + * known to hit it. There's not really much we can do here, so turn this + * into a short write. 
+ */ + if (count_fsb > ac->reserved_blocks) { + xfs_warn_ratelimited(mp, +"Short write on ino 0x%llx comm %.20s due to three-way race with write fault and direct I/O", + ip->i_ino, current->comm); + count_fsb = ac->reserved_blocks; + if (!count_fsb) { + error = -EIO; + goto out_unlock; + } + } + + error = xfs_quota_reserve_blkres(ip, count_fsb); + if (error) + goto out_unlock; + + indlen = xfs_bmap_worst_indlen(ip, count_fsb); + error = xfs_dec_fdblocks(mp, indlen, false); + if (error) + goto out_unlock; + ip->i_delayed_blks += count_fsb; + xfs_mod_delalloc(ip, count_fsb, indlen); + + got.br_startoff = offset_fsb; + got.br_startblock = nullstartblock(indlen); + got.br_blockcount = count_fsb; + got.br_state = XFS_EXT_NORM; + xfs_bmap_add_extent_hole_delay(ip, XFS_COW_FORK, &icur, &got); + ac->reserved_blocks -= count_fsb; + iomap_flags |= IOMAP_F_NEW; + + trace_xfs_iomap_alloc(ip, offset, XFS_FSB_TO_B(mp, count_fsb), + XFS_COW_FORK, &got); +done: + error = xfs_bmbt_to_iomap(ip, iomap, &got, flags, iomap_flags, + xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED)); +out_unlock: + xfs_iunlock(ip, lockmode); + return error; +} + static int xfs_buffered_write_iomap_begin( struct inode *inode, @@ -1017,6 +1522,10 @@ xfs_buffered_write_iomap_begin( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_is_zoned_inode(ip)) + return xfs_zoned_buffered_write_iomap_begin(inode, offset, + count, flags, iomap, srcmap); + /* we can't use delayed allocations when using extent size hints */ if (xfs_get_extsz_hint(ip)) return xfs_direct_write_iomap_begin(inode, offset, count, @@ -1249,10 +1758,13 @@ xfs_buffered_write_delalloc_punch( loff_t length, struct iomap *iomap) { + struct iomap_iter *iter = + container_of(iomap, struct iomap_iter, iomap); + xfs_bmap_punch_delalloc_range(XFS_I(inode), (iomap->flags & IOMAP_F_SHARED) ? 
XFS_COW_FORK : XFS_DATA_FORK, - offset, offset + length); + offset, offset + length, iter->private); } static int @@ -1489,6 +2001,7 @@ xfs_zero_range( struct xfs_inode *ip, loff_t pos, loff_t len, + struct xfs_zone_alloc_ctx *ac, bool *did_zero) { struct inode *inode = VFS_I(ip); @@ -1499,13 +2012,14 @@ xfs_zero_range( return dax_zero_range(inode, pos, len, did_zero, &xfs_dax_write_iomap_ops); return iomap_zero_range(inode, pos, len, did_zero, - &xfs_buffered_write_iomap_ops, NULL); + &xfs_buffered_write_iomap_ops, ac); } int xfs_truncate_page( struct xfs_inode *ip, loff_t pos, + struct xfs_zone_alloc_ctx *ac, bool *did_zero) { struct inode *inode = VFS_I(ip); @@ -1514,5 +2028,5 @@ xfs_truncate_page( return dax_truncate_page(inode, pos, did_zero, &xfs_dax_write_iomap_ops); return iomap_truncate_page(inode, pos, did_zero, - &xfs_buffered_write_iomap_ops, NULL); + &xfs_buffered_write_iomap_ops, ac); } diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 8347268af727..d330c4a581b1 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -10,6 +10,7 @@ struct xfs_inode; struct xfs_bmbt_irec; +struct xfs_zone_alloc_ctx; int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, xfs_fileoff_t count_fsb, unsigned int flags, @@ -24,8 +25,9 @@ int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap, u16 iomap_flags, u64 sequence_cookie); int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len, - bool *did_zero); -int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero); + struct xfs_zone_alloc_ctx *ac, bool *did_zero); +int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, + struct xfs_zone_alloc_ctx *ac, bool *did_zero); static inline xfs_filblks_t xfs_aligned_fsb_count( @@ -49,6 +51,7 @@ xfs_aligned_fsb_count( extern const struct iomap_ops xfs_buffered_write_iomap_ops; extern const struct iomap_ops xfs_direct_write_iomap_ops; +extern const struct iomap_ops xfs_zoned_direct_write_iomap_ops; extern const struct iomap_ops xfs_read_iomap_ops; extern const struct iomap_ops xfs_seek_iomap_ops; extern const struct iomap_ops xfs_xattr_iomap_ops; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index a4480098d2bf..756bd3ca8e00 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -29,6 +29,7 @@ #include "xfs_xattr.h" #include "xfs_file.h" #include "xfs_bmap.h" +#include "xfs_zone_alloc.h" #include <linux/posix_acl.h> #include <linux/security.h> @@ -854,6 +855,7 @@ xfs_setattr_size( uint lock_flags = 0; uint resblks = 0; bool did_zeroing = false; + struct xfs_zone_alloc_ctx ac = { }; xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); ASSERT(S_ISREG(inode->i_mode)); @@ -890,6 +892,28 @@ xfs_setattr_size( inode_dio_wait(inode); /* + * Normally xfs_zoned_space_reserve is supposed to be called outside the + * IOLOCK. For truncate we can't do that since ->setattr is called with + * it already held by the VFS. So for now chicken out and try to + * allocate space under it. + * + * To avoid deadlocks this means we can't block waiting for space, which + * can lead to spurious -ENOSPC if there are no directly available + * blocks. We mitigate this a bit by allowing zeroing to dip into the + * reserved pool, but eventually the VFS calling convention needs to + * change. 
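
[Editor's aside] The hunk after this comment applies that policy; condensed, it behaves roughly like the sketch below. xfs_zoned_space_reserve and the XFS_ZR_* flags are introduced by this series; the wrapper name is invented for illustration.

	/* Non-blocking reservation under a lock we cannot drop. */
	static int reserve_for_truncate(struct xfs_inode *ip,
			struct xfs_zone_alloc_ctx *ac)
	{
		/* cannot sleep waiting for space: the VFS already holds the IOLOCK */
		int error = xfs_zoned_space_reserve(ip, 1,
				XFS_ZR_NOWAIT | XFS_ZR_RESERVED, ac);

		/* no blocks immediately available: report ENOSPC, not EAGAIN */
		if (error == -EAGAIN)
			return -ENOSPC;
		return error;
	}
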
+ */ + if (xfs_is_zoned_inode(ip)) { + error = xfs_zoned_space_reserve(ip, 1, + XFS_ZR_NOWAIT | XFS_ZR_RESERVED, &ac); + if (error) { + if (error == -EAGAIN) + return -ENOSPC; + return error; + } + } + + /* * File data changes must be complete before we start the transaction to * modify the inode. This needs to be done before joining the inode to * the transaction because the inode cannot be unlocked once it is a @@ -902,11 +926,14 @@ xfs_setattr_size( if (newsize > oldsize) { trace_xfs_zero_eof(ip, oldsize, newsize - oldsize); error = xfs_zero_range(ip, oldsize, newsize - oldsize, - &did_zeroing); + &ac, &did_zeroing); } else { - error = xfs_truncate_page(ip, newsize, &did_zeroing); + error = xfs_truncate_page(ip, newsize, &ac, &did_zeroing); } + if (xfs_is_zoned_inode(ip)) + xfs_zoned_space_unreserve(ip, &ac); + if (error) return error; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f8851ff835de..6493bdb57351 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -20,6 +20,7 @@ #include "xfs_sysfs.h" #include "xfs_sb.h" #include "xfs_health.h" +#include "xfs_zone_alloc.h" struct kmem_cache *xfs_log_ticket_cache; @@ -3540,6 +3541,9 @@ xlog_force_shutdown( spin_unlock(&log->l_icloglock); wake_up_var(&log->l_opstate); + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(log->l_mp)) + xfs_zoned_wake_all(log->l_mp); + return log_error; } diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index 6ed485ff2756..15d410d16bb2 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -173,6 +173,10 @@ xfs_warn_experimental( .opstate = XFS_OPSTATE_WARNED_METADIR, .name = "metadata directory tree", }, + [XFS_EXPERIMENTAL_ZONED] = { + .opstate = XFS_OPSTATE_WARNED_ZONED, + .name = "zoned RT device", + }, }; ASSERT(feat >= 0 && feat < XFS_EXPERIMENTAL_MAX); BUILD_BUG_ON(ARRAY_SIZE(features) != XFS_EXPERIMENTAL_MAX); diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index 7fb36ced9df7..a92a4d09c8e9 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -99,6 +99,7 @@ enum xfs_experimental_feat { XFS_EXPERIMENTAL_EXCHRANGE, XFS_EXPERIMENTAL_PPTR, XFS_EXPERIMENTAL_METADIR, + XFS_EXPERIMENTAL_ZONED, XFS_EXPERIMENTAL_MAX, }; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b69356582b86..00b53f479ece 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -40,6 +40,7 @@ #include "xfs_rtrmap_btree.h" #include "xfs_rtrefcount_btree.h" #include "scrub/stats.h" +#include "xfs_zone_alloc.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; @@ -185,7 +186,7 @@ xfs_readsb( */ reread: error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), 0, &bp, buf_ops); + BTOBB(sector_size), &bp, buf_ops); if (error) { if (loud) xfs_warn(mp, "SB validate failed with error %d.", error); @@ -413,7 +414,7 @@ xfs_check_sizes( } error = xfs_buf_read_uncached(mp->m_ddev_targp, d - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); + XFS_FSS_TO_BB(mp, 1), &bp, NULL); if (error) { xfs_warn(mp, "last sector read failed"); return error; @@ -430,7 +431,7 @@ xfs_check_sizes( } error = xfs_buf_read_uncached(mp->m_logdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); + XFS_FSB_TO_BB(mp, 1), &bp, NULL); if (error) { xfs_warn(mp, "log device read failed"); return error; @@ -461,22 +462,38 @@ xfs_mount_reset_sbqflags( return xfs_sync_sb(mp, false); } +static const char *const xfs_free_pool_name[] = { + [XC_FREE_BLOCKS] = "free blocks", + [XC_FREE_RTEXTENTS] = "free rt extents", + [XC_FREE_RTAVAILABLE] = "available 
rt extents", +}; + uint64_t -xfs_default_resblks(xfs_mount_t *mp) +xfs_default_resblks( + struct xfs_mount *mp, + enum xfs_free_counter ctr) { - uint64_t resblks; - - /* - * We default to 5% or 8192 fsbs of space reserved, whichever is - * smaller. This is intended to cover concurrent allocation - * transactions when we initially hit enospc. These each require a 4 - * block reservation. Hence by default we cover roughly 2000 concurrent - * allocation reservations. - */ - resblks = mp->m_sb.sb_dblocks; - do_div(resblks, 20); - resblks = min_t(uint64_t, resblks, 8192); - return resblks; + switch (ctr) { + case XC_FREE_BLOCKS: + /* + * Default to 5% or 8192 FSBs of space reserved, whichever is + * smaller. + * + * This is intended to cover concurrent allocation transactions + * when we initially hit ENOSPC. These each require a 4 block + * reservation. Hence by default we cover roughly 2000 + * concurrent allocation reservations. + */ + return min(div_u64(mp->m_sb.sb_dblocks, 20), 8192ULL); + case XC_FREE_RTEXTENTS: + case XC_FREE_RTAVAILABLE: + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp)) + return xfs_zoned_default_resblks(mp, ctr); + return 0; + default: + ASSERT(0); + return 0; + } } /* Ensure the summary counts are correct. */ @@ -543,7 +560,7 @@ xfs_check_summary_counts( * If we're mounting the rt volume after recovering the log, recompute * frextents from the rtbitmap file to fix the inconsistency. */ - if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) { + if (xfs_has_realtime(mp) && !xfs_has_zoned(mp) && !xfs_is_clean(mp)) { error = xfs_rtalloc_reinit_frextents(mp); if (error) return error; @@ -678,6 +695,7 @@ xfs_mountfs( uint quotamount = 0; uint quotaflags = 0; int error = 0; + int i; xfs_sb_mount_common(mp, sbp); @@ -747,27 +765,15 @@ xfs_mountfs( /* enable fail_at_unmount as default */ mp->m_fail_unmount = true; - super_set_sysfs_name_id(mp->m_super); - - error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, - NULL, mp->m_super->s_id); - if (error) - goto out; - - error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype, - &mp->m_kobj, "stats"); + error = xfs_mount_sysfs_init(mp); if (error) - goto out_remove_sysfs; + goto out_remove_scrub_stats; xchk_stats_register(mp->m_scrub_stats, mp->m_debugfs); - error = xfs_error_sysfs_init(mp); - if (error) - goto out_remove_scrub_stats; - error = xfs_errortag_init(mp); if (error) - goto out_remove_error_sysfs; + goto out_remove_sysfs; error = xfs_uuid_mount(mp); if (error) @@ -1031,6 +1037,12 @@ xfs_mountfs( if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp)) xfs_log_clean(mp); + if (xfs_has_zoned(mp)) { + error = xfs_mount_zones(mp); + if (error) + goto out_rtunmount; + } + /* * Complete the quota initialisation, post-log-replay component. */ @@ -1046,22 +1058,28 @@ xfs_mountfs( * privileged transactions. This is needed so that transaction * space required for critical operations can dip into this pool * when at ENOSPC. This is needed for operations like create with - * attr, unwritten extent conversion at ENOSPC, etc. Data allocations - * are not allowed to use this reserved space. + * attr, unwritten extent conversion at ENOSPC, garbage collection + * etc. Data allocations are not allowed to use this reserved space. * * This may drive us straight to ENOSPC on mount, but that implies * we were already there on the last unmount. Warn if this occurs. */ if (!xfs_is_readonly(mp)) { - error = xfs_reserve_blocks(mp, xfs_default_resblks(mp)); - if (error) - xfs_warn(mp, - "Unable to allocate reserve blocks. 
Continuing without reserve pool."); + for (i = 0; i < XC_FREE_NR; i++) { + error = xfs_reserve_blocks(mp, i, + xfs_default_resblks(mp, i)); + if (error) + xfs_warn(mp, +"Unable to allocate reserve blocks. Continuing without reserve pool for %s.", + xfs_free_pool_name[i]); + } /* Reserve AG blocks for future btree expansion. */ error = xfs_fs_reserve_ag_blocks(mp); if (error && error != -ENOSPC) goto out_agresv; + + xfs_zone_gc_start(mp); } return 0; @@ -1069,6 +1087,8 @@ xfs_mountfs( out_agresv: xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); + if (xfs_has_zoned(mp)) + xfs_unmount_zones(mp); out_rtunmount: xfs_rtunmount_inodes(mp); out_rele_rip: @@ -1116,13 +1136,10 @@ xfs_mountfs( xfs_uuid_unmount(mp); out_remove_errortag: xfs_errortag_del(mp); - out_remove_error_sysfs: - xfs_error_sysfs_del(mp); + out_remove_sysfs: + xfs_mount_sysfs_del(mp); out_remove_scrub_stats: xchk_stats_unregister(mp->m_scrub_stats); - xfs_sysfs_del(&mp->m_stats.xs_kobj); - out_remove_sysfs: - xfs_sysfs_del(&mp->m_kobj); out: return error; } @@ -1148,8 +1165,12 @@ xfs_unmountfs( xfs_inodegc_flush(mp); xfs_blockgc_stop(mp); + if (!test_bit(XFS_OPSTATE_READONLY, &mp->m_opstate)) + xfs_zone_gc_stop(mp); xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); + if (xfs_has_zoned(mp)) + xfs_unmount_zones(mp); xfs_rtunmount_inodes(mp); xfs_irele(mp->m_rootip); if (mp->m_metadirip) @@ -1173,7 +1194,7 @@ xfs_unmountfs( * we only every apply deltas to the superblock and hence the incore * value does not matter.... */ - error = xfs_reserve_blocks(mp, 0); + error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, 0); if (error) xfs_warn(mp, "Unable to free reserved block pool. " "Freespace may not be correct on next mount."); @@ -1195,10 +1216,8 @@ xfs_unmountfs( xfs_free_rtgroups(mp, 0, mp->m_sb.sb_rgcount); xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount); xfs_errortag_del(mp); - xfs_error_sysfs_del(mp); xchk_stats_unregister(mp->m_scrub_stats); - xfs_sysfs_del(&mp->m_stats.xs_kobj); - xfs_sysfs_del(&mp->m_kobj); + xfs_mount_sysfs_del(mp); } /* @@ -1220,52 +1239,67 @@ xfs_fs_writable( return true; } +/* + * Estimate the amount of free space that is not available to userspace and is + * not explicitly reserved from the incore fdblocks. This includes: + * + * - The minimum number of blocks needed to support splitting a bmap btree + * - The blocks currently in use by the freespace btrees because they record + * the actual blocks that will fill per-AG metadata space reservations + */ +uint64_t +xfs_freecounter_unavailable( + struct xfs_mount *mp, + enum xfs_free_counter ctr) +{ + if (ctr != XC_FREE_BLOCKS) + return 0; + return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); +} + void xfs_add_freecounter( struct xfs_mount *mp, - struct percpu_counter *counter, + enum xfs_free_counter ctr, uint64_t delta) { - bool has_resv_pool = (counter == &mp->m_fdblocks); + struct xfs_freecounter *counter = &mp->m_free[ctr]; uint64_t res_used; /* * If the reserve pool is depleted, put blocks back into it first. * Most of the time the pool is full. 
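
[Editor's aside] The hunk below rewrites xfs_add_freecounter around struct xfs_freecounter. Stripped of the percpu counter and the m_sb_lock serialization, the refill-then-credit logic reduces to this equivalent sketch (types simplified, locking omitted):

	#include <stdint.h>

	struct freecounter {
		int64_t  count;      /* stands in for the percpu counter */
		uint64_t res_total;  /* size of the reserve pool */
		uint64_t res_avail;  /* part of the pool still available */
	};

	/* Top up the reserve pool first, then credit the remainder. */
	static void add_free(struct freecounter *fc, uint64_t delta)
	{
		uint64_t res_used = fc->res_total - fc->res_avail;

		if (res_used >= delta) {
			fc->res_avail += delta;
		} else {
			fc->res_avail = fc->res_total;
			fc->count += (int64_t)(delta - res_used);
		}
	}
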
*/ - if (!has_resv_pool || mp->m_resblks == mp->m_resblks_avail) { - percpu_counter_add(counter, delta); + if (likely(counter->res_avail == counter->res_total)) { + percpu_counter_add(&counter->count, delta); return; } spin_lock(&mp->m_sb_lock); - res_used = mp->m_resblks - mp->m_resblks_avail; + res_used = counter->res_total - counter->res_avail; if (res_used > delta) { - mp->m_resblks_avail += delta; + counter->res_avail += delta; } else { delta -= res_used; - mp->m_resblks_avail = mp->m_resblks; - percpu_counter_add(counter, delta); + counter->res_avail = counter->res_total; + percpu_counter_add(&counter->count, delta); } spin_unlock(&mp->m_sb_lock); } + +/* Adjust in-core free blocks or RT extents. */ int xfs_dec_freecounter( struct xfs_mount *mp, - struct percpu_counter *counter, + enum xfs_free_counter ctr, uint64_t delta, bool rsvd) { - int64_t lcounter; - uint64_t set_aside = 0; + struct xfs_freecounter *counter = &mp->m_free[ctr]; s32 batch; - bool has_resv_pool; - ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents); - has_resv_pool = (counter == &mp->m_fdblocks); - if (rsvd) - ASSERT(has_resv_pool); + ASSERT(ctr < XC_FREE_NR); /* * Taking blocks away, need to be more accurate the closer we @@ -1275,7 +1309,7 @@ xfs_dec_freecounter( * then make everything serialise as we are real close to * ENOSPC. */ - if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH, + if (__percpu_counter_compare(&counter->count, 2 * XFS_FDBLOCKS_BATCH, XFS_FDBLOCKS_BATCH) < 0) batch = 1; else @@ -1292,34 +1326,34 @@ xfs_dec_freecounter( * problems (i.e. transaction abort, pagecache discards, etc.) than * slightly premature -ENOSPC. */ - if (has_resv_pool) - set_aside = xfs_fdblocks_unavailable(mp); - percpu_counter_add_batch(counter, -((int64_t)delta), batch); - if (__percpu_counter_compare(counter, set_aside, - XFS_FDBLOCKS_BATCH) >= 0) { - /* we had space! */ - return 0; - } - - /* - * lock up the sb for dipping into reserves before releasing the space - * that took us to ENOSPC. - */ - spin_lock(&mp->m_sb_lock); - percpu_counter_add(counter, delta); - if (!has_resv_pool || !rsvd) - goto fdblocks_enospc; - - lcounter = (long long)mp->m_resblks_avail - delta; - if (lcounter >= 0) { - mp->m_resblks_avail = lcounter; + percpu_counter_add_batch(&counter->count, -((int64_t)delta), batch); + if (__percpu_counter_compare(&counter->count, + xfs_freecounter_unavailable(mp, ctr), + XFS_FDBLOCKS_BATCH) < 0) { + /* + * Lock up the sb for dipping into reserves before releasing the + * space that took us to ENOSPC. + */ + spin_lock(&mp->m_sb_lock); + percpu_counter_add(&counter->count, delta); + if (!rsvd) + goto fdblocks_enospc; + if (delta > counter->res_avail) { + if (ctr == XC_FREE_BLOCKS) + xfs_warn_once(mp, +"Reserve blocks depleted! Consider increasing reserve pool size."); + goto fdblocks_enospc; + } + counter->res_avail -= delta; + trace_xfs_freecounter_reserved(mp, ctr, delta, _RET_IP_); spin_unlock(&mp->m_sb_lock); - return 0; } - xfs_warn_once(mp, -"Reserve blocks depleted! Consider increasing reserve pool size."); + + /* we had space! */ + return 0; fdblocks_enospc: + trace_xfs_freecounter_enospc(mp, ctr, delta, _RET_IP_); spin_unlock(&mp->m_sb_lock); return -ENOSPC; } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index fbed172d6770..799b84220ebb 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -98,11 +98,41 @@ struct xfs_groups { uint8_t blklog; /* + * Zoned devices can have gaps beyond the usable capacity of a zone and + * the end in the LBA/daddr address space. 
In other words, the hardware + * equivalent to the RT groups already takes care of the power of 2 + * alignment for us. In this case the sparse FSB/RTB address space maps + * 1:1 to the device address space. + */ + bool has_daddr_gaps; + + /* * Mask to extract the group-relative block number from a FSB. * For a pre-rtgroups filesystem we pretend to have one very large * rtgroup, so this mask must be 64-bit. */ uint64_t blkmask; + + /* + * Start of the first group in the device. This is used to support a + * RT device following the data device on the same block device for + * SMR hard drives. + */ + xfs_fsblock_t start_fsb; +}; + +struct xfs_freecounter { + /* free blocks for general use: */ + struct percpu_counter count; + + /* total reserved blocks: */ + uint64_t res_total; + + /* available reserved blocks: */ + uint64_t res_avail; + + /* reserved blks @ remount,ro: */ + uint64_t res_saved; }; /* @@ -198,6 +228,7 @@ typedef struct xfs_mount { bool m_fail_unmount; bool m_finobt_nores; /* no per-AG finobt resv. */ bool m_update_sb; /* sb needs update in mount */ + unsigned int m_max_open_zones; /* * Bitsets of per-fs metadata that have been checked and/or are sick. @@ -222,8 +253,8 @@ typedef struct xfs_mount { spinlock_t ____cacheline_aligned m_sb_lock; /* sb counter lock */ struct percpu_counter m_icount; /* allocated inodes counter */ struct percpu_counter m_ifree; /* free inodes counter */ - struct percpu_counter m_fdblocks; /* free block counter */ - struct percpu_counter m_frextents; /* free rt extent counter */ + + struct xfs_freecounter m_free[XC_FREE_NR]; /* * Count of data device blocks reserved for delayed allocations, @@ -245,10 +276,8 @@ typedef struct xfs_mount { atomic64_t m_allocbt_blks; struct xfs_groups m_groups[XG_TYPE_MAX]; - uint64_t m_resblks; /* total reserved blocks */ - uint64_t m_resblks_avail;/* available reserved blocks */ - uint64_t m_resblks_save; /* reserved blks @ remount,ro */ struct delayed_work m_reclaim_work; /* background inode reclaim */ + struct xfs_zone_info *m_zone_info; /* zone allocator information */ struct dentry *m_debugfs; /* debugfs parent */ struct xfs_kobj m_kobj; struct xfs_kobj m_error_kobj; @@ -258,10 +287,16 @@ typedef struct xfs_mount { #ifdef CONFIG_XFS_ONLINE_SCRUB_STATS struct xchk_stats *m_scrub_stats; #endif + struct xfs_kobj m_zoned_kobj; xfs_agnumber_t m_agfrotor; /* last ag where space found */ atomic_t m_agirotor; /* last ag dir inode alloced */ atomic_t m_rtgrotor; /* last rtgroup rtpicked */ + struct mutex m_metafile_resv_lock; + uint64_t m_metafile_resv_target; + uint64_t m_metafile_resv_used; + uint64_t m_metafile_resv_avail; + /* Memory shrinker to throttle and reprioritize inodegc */ struct shrinker *m_inodegc_shrinker; /* @@ -336,8 +371,10 @@ typedef struct xfs_mount { #define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */ #define XFS_FEAT_EXCHANGE_RANGE (1ULL << 27) /* exchange range */ #define XFS_FEAT_METADIR (1ULL << 28) /* metadata directory tree */ +#define XFS_FEAT_ZONED (1ULL << 29) /* zoned RT device */ /* Mount features */ +#define XFS_FEAT_NOLIFETIME (1ULL << 47) /* disable lifetime hints */ #define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ #define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */ #define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */ @@ -392,6 +429,8 @@ __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR) __XFS_HAS_FEAT(large_extent_counts, NREXT64) __XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE) __XFS_HAS_FEAT(metadir, METADIR) +__XFS_HAS_FEAT(zoned, 
ZONED)
+__XFS_HAS_FEAT(nolifetime, NOLIFETIME)
 
 static inline bool xfs_has_rtgroups(const struct xfs_mount *mp)
 {
@@ -402,7 +441,9 @@ static inline bool xfs_has_rtgroups(const struct xfs_mount *mp)
 static inline bool xfs_has_rtsb(const struct xfs_mount *mp)
 {
 	/* all rtgroups filesystems with an rt section have an rtsb */
-	return xfs_has_rtgroups(mp) && xfs_has_realtime(mp);
+	return xfs_has_rtgroups(mp) &&
+		xfs_has_realtime(mp) &&
+		!xfs_has_zoned(mp);
 }
 
 static inline bool xfs_has_rtrmapbt(const struct xfs_mount *mp)
@@ -417,6 +458,11 @@ static inline bool xfs_has_rtreflink(const struct xfs_mount *mp)
 		xfs_has_reflink(mp);
 }
 
+static inline bool xfs_has_nonzoned(const struct xfs_mount *mp)
+{
+	return !xfs_has_zoned(mp);
+}
+
 /*
  * Some features are always on for v5 file systems, allow the compiler to
  * eliminate dead code when building without v4 support.
  */
@@ -520,6 +566,10 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
 #define XFS_OPSTATE_WARNED_METADIR	17
 /* Filesystem should use qflags to determine quotaon status */
 #define XFS_OPSTATE_RESUMING_QUOTAON	18
+/* Kernel has logged a warning about zoned RT device being used on this fs. */
+#define XFS_OPSTATE_WARNED_ZONED	19
+/* (Zoned) GC is in progress */
+#define XFS_OPSTATE_ZONEGC_RUNNING	20
 
 #define __XFS_IS_OPSTATE(name, NAME) \
 static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
@@ -564,6 +614,7 @@ static inline bool xfs_clear_resuming_quotaon(struct xfs_mount *mp)
 #endif /* CONFIG_XFS_QUOTA */
 __XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT)
 __XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP)
+__XFS_IS_OPSTATE(zonegc_running, ZONEGC_RUNNING)
 
 static inline bool
 xfs_should_warn(struct xfs_mount *mp, long nr)
@@ -633,7 +684,8 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
 }
 
 extern void	xfs_uuid_table_free(void);
-extern uint64_t xfs_default_resblks(xfs_mount_t *mp);
+uint64_t	xfs_default_resblks(struct xfs_mount *mp,
+			enum xfs_free_counter ctr);
 extern int	xfs_mountfs(xfs_mount_t *mp);
 extern void	xfs_unmountfs(xfs_mount_t *);
 
@@ -646,45 +698,74 @@ extern void	xfs_unmountfs(xfs_mount_t *);
  */
 #define XFS_FDBLOCKS_BATCH	1024
 
+uint64_t xfs_freecounter_unavailable(struct xfs_mount *mp,
+		enum xfs_free_counter ctr);
+
 /*
- * Estimate the amount of free space that is not available to userspace and is
- * not explicitly reserved from the incore fdblocks.  This includes:
- *
- * - The minimum number of blocks needed to support splitting a bmap btree
- * - The blocks currently in use by the freespace btrees because they record
- *   the actual blocks that will fill per-AG metadata space reservations
+ * Sum up the freecounter, but never return negative values.
  */
-static inline uint64_t
-xfs_fdblocks_unavailable(
-	struct xfs_mount	*mp)
+static inline s64 xfs_sum_freecounter(struct xfs_mount *mp,
+		enum xfs_free_counter ctr)
 {
-	return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
+	return percpu_counter_sum_positive(&mp->m_free[ctr].count);
 }
 
-int xfs_dec_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
+/*
+ * Same as above, but does return negative values.  Mostly useful for
+ * special cases like repair and tracing.
+ */
+static inline s64 xfs_sum_freecounter_raw(struct xfs_mount *mp,
+		enum xfs_free_counter ctr)
+{
+	return percpu_counter_sum(&mp->m_free[ctr].count);
+}
+
+/*
+ * This just provides an estimate without the cpu-local updates; use
+ * xfs_sum_freecounter for the exact value.
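
[Editor's aside] Why both accessors exist: percpu_counter_sum() folds in every CPU's local delta (exact, but takes the counter lock and walks all CPUs), while percpu_counter_read() only looks at the shared count (cheap, but stale by up to the batch size on each CPU). A hypothetical wrapper, using only the helpers defined in this hunk, to show the choice:

	/* illustrative only: pick the accessor based on required accuracy */
	static inline s64 free_blocks(struct xfs_mount *mp, bool exact)
	{
		return exact ? xfs_sum_freecounter(mp, XC_FREE_BLOCKS)
			     : xfs_estimate_freecounter(mp, XC_FREE_BLOCKS);
	}
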
+ */ +static inline s64 xfs_estimate_freecounter(struct xfs_mount *mp, + enum xfs_free_counter ctr) +{ + return percpu_counter_read_positive(&mp->m_free[ctr].count); +} + +static inline int xfs_compare_freecounter(struct xfs_mount *mp, + enum xfs_free_counter ctr, s64 rhs, s32 batch) +{ + return __percpu_counter_compare(&mp->m_free[ctr].count, rhs, batch); +} + +static inline void xfs_set_freecounter(struct xfs_mount *mp, + enum xfs_free_counter ctr, uint64_t val) +{ + percpu_counter_set(&mp->m_free[ctr].count, val); +} + +int xfs_dec_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr, uint64_t delta, bool rsvd); -void xfs_add_freecounter(struct xfs_mount *mp, struct percpu_counter *counter, +void xfs_add_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr, uint64_t delta); static inline int xfs_dec_fdblocks(struct xfs_mount *mp, uint64_t delta, bool reserved) { - return xfs_dec_freecounter(mp, &mp->m_fdblocks, delta, reserved); + return xfs_dec_freecounter(mp, XC_FREE_BLOCKS, delta, reserved); } static inline void xfs_add_fdblocks(struct xfs_mount *mp, uint64_t delta) { - xfs_add_freecounter(mp, &mp->m_fdblocks, delta); + xfs_add_freecounter(mp, XC_FREE_BLOCKS, delta); } static inline int xfs_dec_frextents(struct xfs_mount *mp, uint64_t delta) { - return xfs_dec_freecounter(mp, &mp->m_frextents, delta, false); + return xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, delta, false); } static inline void xfs_add_frextents(struct xfs_mount *mp, uint64_t delta) { - xfs_add_freecounter(mp, &mp->m_frextents, delta); + xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, delta); } extern int xfs_readsb(xfs_mount_t *, int); @@ -706,5 +787,9 @@ int xfs_add_incompat_log_feature(struct xfs_mount *mp, uint32_t feature); bool xfs_clear_incompat_log_features(struct xfs_mount *mp); void xfs_mod_delalloc(struct xfs_inode *ip, int64_t data_delta, int64_t ind_delta); +static inline void xfs_mod_sb_delalloc(struct xfs_mount *mp, int64_t delta) +{ + percpu_counter_add(&mp->m_delalloc_blks, delta); +} #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index e1ba5af6250f..417439b58785 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1711,7 +1711,8 @@ xfs_qm_mount_quotas( * immediately. We only support rtquota if rtgroups are enabled to * avoid problems with older kernels. 
*/ - if (mp->m_sb.sb_rextents && !xfs_has_rtgroups(mp)) { + if (mp->m_sb.sb_rextents && + (!xfs_has_rtgroups(mp) || xfs_has_zoned(mp))) { xfs_notice(mp, "Cannot turn on quotas for realtime filesystem"); mp->m_qflags = 0; goto write_changes; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 59f7fc16eb80..cc3b4df88110 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -235,7 +235,7 @@ xfs_reflink_trim_around_shared( int error = 0; /* Holes, unwritten, and delalloc extents cannot be shared */ - if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) { + if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_written_extent(irec)) { *shared = false; return 0; } @@ -651,7 +651,7 @@ xfs_reflink_cancel_cow_blocks( if (isnullstartblock(del.br_startblock)) { xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &got, - &del); + &del, 0); } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) { ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER); @@ -1207,15 +1207,9 @@ xfs_reflink_ag_has_free_space( if (!xfs_has_rmapbt(mp)) return 0; if (XFS_IS_REALTIME_INODE(ip)) { - struct xfs_rtgroup *rtg; - xfs_rgnumber_t rgno; - - rgno = xfs_rtb_to_rgno(mp, fsb); - rtg = xfs_rtgroup_get(mp, rgno); - if (xfs_metafile_resv_critical(rtg_rmap(rtg))) - error = -ENOSPC; - xfs_rtgroup_put(rtg); - return error; + if (xfs_metafile_resv_critical(mp)) + return -ENOSPC; + return 0; } agno = XFS_FSB_TO_AGNO(mp, fsb); @@ -1538,7 +1532,7 @@ xfs_reflink_zero_posteof( return 0; trace_xfs_zero_eof(ip, isize, pos - isize); - return xfs_zero_range(ip, isize, pos - isize, NULL); + return xfs_zero_range(ip, isize, pos - isize, NULL, NULL); } /* diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 57bef567e011..6484c596ecea 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -33,6 +33,7 @@ #include "xfs_trace.h" #include "xfs_rtrefcount_btree.h" #include "xfs_reflink.h" +#include "xfs_zone_alloc.h" /* * Return whether there are any free extents in the size range given @@ -663,7 +664,8 @@ xfs_rtunmount_rtg( for (i = 0; i < XFS_RTGI_MAX; i++) xfs_rtginode_irele(&rtg->rtg_inodes[i]); - kvfree(rtg->rtg_rsum_cache); + if (!xfs_has_zoned(rtg_mount(rtg))) + kvfree(rtg->rtg_rsum_cache); } static int @@ -837,7 +839,7 @@ xfs_growfs_rt_init_rtsb( return 0; error = xfs_buf_get_uncached(mp->m_rtdev_targp, XFS_FSB_TO_BB(mp, 1), - 0, &rtsb_bp); + &rtsb_bp); if (error) return error; @@ -858,6 +860,84 @@ xfs_growfs_rt_init_rtsb( return error; } +static void +xfs_growfs_rt_sb_fields( + struct xfs_trans *tp, + const struct xfs_mount *nmp) +{ + struct xfs_mount *mp = tp->t_mountp; + + if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE, + nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize); + if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS, + nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks); + if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS, + nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks); + if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS, + nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents); + if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG, + nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog); + if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RGCOUNT, + nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount); +} + +static int +xfs_growfs_rt_zoned( + struct xfs_rtgroup 
*rtg, + xfs_rfsblock_t nrblocks) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_mount *nmp; + struct xfs_trans *tp; + xfs_rtbxlen_t freed_rtx; + int error; + + /* + * Calculate new sb and mount fields for this round. Also ensure the + * rtg_extents value is uptodate as the rtbitmap code relies on it. + */ + nmp = xfs_growfs_rt_alloc_fake_mount(mp, nrblocks, + mp->m_sb.sb_rextsize); + if (!nmp) + return -ENOMEM; + freed_rtx = nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents; + + xfs_rtgroup_calc_geometry(nmp, rtg, rtg_rgno(rtg), + nmp->m_sb.sb_rgcount, nmp->m_sb.sb_rextents); + + error = xfs_trans_alloc(mp, &M_RES(nmp)->tr_growrtfree, 0, 0, 0, &tp); + if (error) + goto out_free; + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); + + xfs_growfs_rt_sb_fields(tp, nmp); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, freed_rtx); + + error = xfs_trans_commit(tp); + if (error) + goto out_free; + + /* + * Ensure the mount RT feature flag is now set, and compute new + * maxlevels for rt btrees. + */ + mp->m_features |= XFS_FEAT_REALTIME; + xfs_rtrmapbt_compute_maxlevels(mp); + xfs_rtrefcountbt_compute_maxlevels(mp); + xfs_zoned_add_available(mp, freed_rtx); +out_free: + kfree(nmp); + return error; +} + static int xfs_growfs_rt_bmblock( struct xfs_rtgroup *rtg, @@ -943,24 +1023,7 @@ xfs_growfs_rt_bmblock( /* * Update superblock fields. */ - if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSIZE, - nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize); - if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBMBLOCKS, - nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks); - if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBLOCKS, - nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks); - if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTENTS, - nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents); - if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSLOG, - nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog); - if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RGCOUNT, - nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount); + xfs_growfs_rt_sb_fields(args.tp, nmp); /* * Free the new extent. @@ -1127,6 +1190,11 @@ xfs_growfs_rtg( goto out_rele; } + if (xfs_has_zoned(mp)) { + error = xfs_growfs_rt_zoned(rtg, nrblocks); + goto out_rele; + } + error = xfs_growfs_rt_alloc_blocks(rtg, nrblocks, rextsize, &bmblocks); if (error) goto out_rele; @@ -1144,10 +1212,8 @@ xfs_growfs_rtg( goto out_error; } - if (old_rsum_cache) - kvfree(old_rsum_cache); - xfs_rtgroup_rele(rtg); - return 0; + kvfree(old_rsum_cache); + goto out_rele; out_error: /* @@ -1195,6 +1261,22 @@ xfs_growfs_check_rtgeom( if (min_logfsbs > mp->m_sb.sb_logblocks) return -EINVAL; + + if (xfs_has_zoned(mp)) { + uint32_t gblocks = mp->m_groups[XG_TYPE_RTG].blocks; + uint32_t rem; + + if (rextsize != 1) + return -EINVAL; + div_u64_rem(mp->m_sb.sb_rblocks, gblocks, &rem); + if (rem) { + xfs_warn(mp, +"new RT volume size (%lld) not aligned to RT group size (%d)", + mp->m_sb.sb_rblocks, gblocks); + return -EINVAL; + } + } + return 0; } @@ -1249,6 +1331,35 @@ xfs_grow_last_rtg( } /* + * Read in the last block of the RT device to make sure it is accessible. 
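
[Editor's aside] xfs_rt_check_size in the next hunk performs this check. Its essential shape, with read_one_block standing in for the xfs_buf_read_uncached()/xfs_buf_relse() pair (the helper name is invented):

	static int rt_last_block_readable(struct xfs_mount *mp,
			xfs_rfsblock_t last_block)
	{
		xfs_daddr_t daddr = XFS_FSB_TO_BB(mp, last_block);

		/* a mismatch after round-tripping means the size overflowed */
		if (XFS_BB_TO_FSB(mp, daddr) != last_block)
			return -EFBIG;

		/* note the sb_rtstart offset for RT sections that follow the
		 * data device on the same block device */
		return read_one_block(mp->m_rtdev_targp,
				XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart) + daddr);
	}
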
+ */ +static int +xfs_rt_check_size( + struct xfs_mount *mp, + xfs_rfsblock_t last_block) +{ + xfs_daddr_t daddr = XFS_FSB_TO_BB(mp, last_block); + struct xfs_buf *bp; + int error; + + if (XFS_BB_TO_FSB(mp, daddr) != last_block) { + xfs_warn(mp, "RT device size overflow: %llu != %llu", + XFS_BB_TO_FSB(mp, daddr), last_block); + return -EFBIG; + } + + error = xfs_buf_read_uncached(mp->m_rtdev_targp, + XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart) + daddr, + XFS_FSB_TO_BB(mp, 1), &bp, NULL); + if (error) + xfs_warn(mp, "cannot read last RT device sector (%lld)", + last_block); + else + xfs_buf_relse(bp); + return error; +} + +/* * Grow the realtime area of the filesystem. */ int @@ -1259,7 +1370,6 @@ xfs_growfs_rt( xfs_rgnumber_t old_rgcount = mp->m_sb.sb_rgcount; xfs_rgnumber_t new_rgcount = 1; xfs_rgnumber_t rgno; - struct xfs_buf *bp; xfs_agblock_t old_rextsize = mp->m_sb.sb_rextsize; int error; @@ -1302,15 +1412,10 @@ xfs_growfs_rt( error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks); if (error) goto out_unlock; - /* - * Read in the last block of the device, make sure it exists. - */ - error = xfs_buf_read_uncached(mp->m_rtdev_targp, - XFS_FSB_TO_BB(mp, in->newblocks - 1), - XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); + + error = xfs_rt_check_size(mp, in->newblocks - 1); if (error) goto out_unlock; - xfs_buf_relse(bp); /* * Calculate new parameters. These are the final values to be reached. @@ -1376,8 +1481,7 @@ xfs_growfs_rt( error = error2; /* Reset the rt metadata btree space reservations. */ - xfs_rt_resv_free(mp); - error2 = xfs_rt_resv_init(mp); + error2 = xfs_metafile_resv_init(mp); if (error2 && error2 != -ENOSPC) error = error2; } @@ -1407,7 +1511,7 @@ xfs_rtmount_readsb( /* m_blkbb_log is not set up yet */ error = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_RTSB_DADDR, - mp->m_sb.sb_blocksize >> BBSHIFT, 0, &bp, + mp->m_sb.sb_blocksize >> BBSHIFT, &bp, &xfs_rtsb_buf_ops); if (error) { xfs_warn(mp, "rt sb validate failed with error %d.", error); @@ -1444,10 +1548,6 @@ int /* error */ xfs_rtmount_init( struct xfs_mount *mp) /* file system mount structure */ { - struct xfs_buf *bp; /* buffer for last block of subvolume */ - xfs_daddr_t d; /* address of last block of subvolume */ - int error; - if (mp->m_sb.sb_rblocks == 0) return 0; if (mp->m_rtdev_targp == NULL) { @@ -1458,25 +1558,7 @@ xfs_rtmount_init( mp->m_rsumblocks = xfs_rtsummary_blockcount(mp, &mp->m_rsumlevels); - /* - * Check that the realtime section is an ok size. - */ - d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); - if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) { - xfs_warn(mp, "realtime mount -- %llu != %llu", - (unsigned long long) XFS_BB_TO_FSB(mp, d), - (unsigned long long) mp->m_sb.sb_rblocks); - return -EFBIG; - } - error = xfs_buf_read_uncached(mp->m_rtdev_targp, - d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); - if (error) { - xfs_warn(mp, "realtime device size check failed"); - return error; - } - xfs_buf_relse(bp); - return 0; + return xfs_rt_check_size(mp, mp->m_sb.sb_rblocks - 1); } static int @@ -1519,50 +1601,10 @@ xfs_rtalloc_reinit_frextents( spin_lock(&mp->m_sb_lock); mp->m_sb.sb_frextents = val; spin_unlock(&mp->m_sb_lock); - percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents); + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_sb.sb_frextents); return 0; } -/* Free space reservations for rt metadata inodes. 
*/ -void -xfs_rt_resv_free( - struct xfs_mount *mp) -{ - struct xfs_rtgroup *rtg = NULL; - unsigned int i; - - while ((rtg = xfs_rtgroup_next(mp, rtg))) { - for (i = 0; i < XFS_RTGI_MAX; i++) - xfs_metafile_resv_free(rtg->rtg_inodes[i]); - } -} - -/* Reserve space for rt metadata inodes' space expansion. */ -int -xfs_rt_resv_init( - struct xfs_mount *mp) -{ - struct xfs_rtgroup *rtg = NULL; - xfs_filblks_t ask; - int error = 0; - - while ((rtg = xfs_rtgroup_next(mp, rtg))) { - int err2; - - ask = xfs_rtrmapbt_calc_reserves(mp); - err2 = xfs_metafile_resv_init(rtg_rmap(rtg), ask); - if (err2 && !error) - error = err2; - - ask = xfs_rtrefcountbt_calc_reserves(mp); - err2 = xfs_metafile_resv_init(rtg_refcount(rtg), ask); - if (err2 && !error) - error = err2; - } - - return error; -} - /* * Read in the bmbt of an rt metadata inode so that we never have to load them * at runtime. This enables the use of shared ILOCKs for rtbitmap scans. Use @@ -1613,6 +1655,8 @@ xfs_rtmount_rtg( } } + if (xfs_has_zoned(mp)) + return 0; return xfs_alloc_rsum_cache(rtg, mp->m_sb.sb_rbmblocks); } @@ -2097,6 +2141,8 @@ xfs_bmap_rtalloc( ap->datatype & XFS_ALLOC_INITIAL_USER_DATA; int error; + ASSERT(!xfs_has_zoned(ap->tp->t_mountp)); + retry: error = xfs_rtallocate_align(ap, &ralen, &raminlen, &prod, &noalign); if (error) diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 0d95b29092c9..78a690b489ed 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -34,9 +34,6 @@ int /* error */ xfs_rtmount_inodes( struct xfs_mount *mp); /* file system mount structure */ -void xfs_rt_resv_free(struct xfs_mount *mp); -int xfs_rt_resv_init(struct xfs_mount *mp); - /* * Grow the realtime area of the filesystem. */ @@ -65,8 +62,6 @@ xfs_rtmount_init( } # define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 
0 : (-ENOSYS)) # define xfs_rtunmount_inodes(m) -# define xfs_rt_resv_free(mp) ((void)0) -# define xfs_rt_resv_init(mp) (0) static inline int xfs_growfs_check_rtgeom(const struct xfs_mount *mp, diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 62d04f4843cf..53944cc7af24 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -46,6 +46,7 @@ #include "xfs_exchmaps_item.h" #include "xfs_parent.h" #include "xfs_rtalloc.h" +#include "xfs_zone_alloc.h" #include "scrub/stats.h" #include "scrub/rcbag_btree.h" @@ -109,7 +110,8 @@ enum { Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota, Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, - Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, + Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones, + Opt_lifetime, Opt_nolifetime, }; static const struct fs_parameter_spec xfs_fs_parameters[] = { @@ -154,6 +156,9 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = { fsparam_flag("nodiscard", Opt_nodiscard), fsparam_flag("dax", Opt_dax), fsparam_enum("dax", Opt_dax_enum, dax_param_enums), + fsparam_u32("max_open_zones", Opt_max_open_zones), + fsparam_flag("lifetime", Opt_lifetime), + fsparam_flag("nolifetime", Opt_nolifetime), {} }; @@ -182,6 +187,7 @@ xfs_fs_show_options( { XFS_FEAT_LARGE_IOSIZE, ",largeio" }, { XFS_FEAT_DAX_ALWAYS, ",dax=always" }, { XFS_FEAT_DAX_NEVER, ",dax=never" }, + { XFS_FEAT_NOLIFETIME, ",nolifetime" }, { 0, NULL } }; struct xfs_mount *mp = XFS_M(root->d_sb); @@ -233,6 +239,9 @@ xfs_fs_show_options( if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) seq_puts(m, ",noquota"); + if (mp->m_max_open_zones) + seq_printf(m, ",max_open_zones=%u", mp->m_max_open_zones); + return 0; } @@ -533,7 +542,15 @@ xfs_setup_devices( if (error) return error; } - if (mp->m_rtdev_targp) { + + if (mp->m_sb.sb_rtstart) { + if (mp->m_rtdev_targp) { + xfs_warn(mp, + "can't use internal and external rtdev at the same time"); + return -EINVAL; + } + mp->m_rtdev_targp = mp->m_ddev_targp; + } else if (mp->m_rtname) { error = xfs_setsize_buftarg(mp->m_rtdev_targp, mp->m_sb.sb_sectsize); if (error) @@ -757,7 +774,7 @@ xfs_mount_free( { if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) xfs_free_buftarg(mp->m_logdev_targp); - if (mp->m_rtdev_targp) + if (mp->m_rtdev_targp && mp->m_rtdev_targp != mp->m_ddev_targp) xfs_free_buftarg(mp->m_rtdev_targp); if (mp->m_ddev_targp) xfs_free_buftarg(mp->m_ddev_targp); @@ -814,6 +831,7 @@ xfs_fs_sync_fs( if (sb->s_writers.frozen == SB_FREEZE_PAGEFAULT) { xfs_inodegc_stop(mp); xfs_blockgc_stop(mp); + xfs_zone_gc_stop(mp); } return 0; @@ -834,10 +852,12 @@ xfs_statfs_data( struct kstatfs *st) { int64_t fdblocks = - percpu_counter_sum(&mp->m_fdblocks); + xfs_sum_freecounter(mp, XC_FREE_BLOCKS); /* make sure st->f_bfree does not underflow */ - st->f_bfree = max(0LL, fdblocks - xfs_fdblocks_unavailable(mp)); + st->f_bfree = max(0LL, + fdblocks - xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS)); + /* * sb_dblocks can change during growfs, but nothing cares about reporting * the old or new value during growfs. 
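
[Editor's aside] The statfs hunk above clamps f_bfree with max(0LL, ...) so that set-aside and btree-held blocks can never drive the reported free space negative. The same arithmetic as a standalone sketch (helper name invented):

	#include <stdint.h>

	/* never let unavailable blocks push f_bfree below zero */
	static uint64_t bfree_for_statfs(int64_t fdblocks, uint64_t unavailable)
	{
		if (fdblocks <= (int64_t)unavailable)
			return 0;
		return (uint64_t)fdblocks - unavailable;
	}
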
@@ -856,8 +876,9 @@ xfs_statfs_rt(
 	struct kstatfs		*st)
 {
 	st->f_bfree = xfs_rtbxlen_to_blen(mp,
-			percpu_counter_sum_positive(&mp->m_frextents));
-	st->f_blocks = mp->m_sb.sb_rblocks;
+			xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
+	st->f_blocks = mp->m_sb.sb_rblocks - xfs_rtbxlen_to_blen(mp,
+			mp->m_free[XC_FREE_RTEXTENTS].res_total);
 }
 
 static void
@@ -922,24 +943,32 @@ xfs_fs_statfs(
 }
 
 STATIC void
-xfs_save_resvblks(struct xfs_mount *mp)
+xfs_save_resvblks(
+	struct xfs_mount	*mp)
 {
-	mp->m_resblks_save = mp->m_resblks;
-	xfs_reserve_blocks(mp, 0);
+	enum xfs_free_counter	i;
+
+	for (i = 0; i < XC_FREE_NR; i++) {
+		mp->m_free[i].res_saved = mp->m_free[i].res_total;
+		xfs_reserve_blocks(mp, i, 0);
+	}
 }
 
 STATIC void
-xfs_restore_resvblks(struct xfs_mount *mp)
+xfs_restore_resvblks(
+	struct xfs_mount	*mp)
 {
-	uint64_t resblks;
+	uint64_t		resblks;
+	enum xfs_free_counter	i;
 
-	if (mp->m_resblks_save) {
-		resblks = mp->m_resblks_save;
-		mp->m_resblks_save = 0;
-	} else
-		resblks = xfs_default_resblks(mp);
-
-	xfs_reserve_blocks(mp, resblks);
+	for (i = 0; i < XC_FREE_NR; i++) {
+		if (mp->m_free[i].res_saved) {
+			resblks = mp->m_free[i].res_saved;
+			mp->m_free[i].res_saved = 0;
+		} else
+			resblks = xfs_default_resblks(mp, i);
+		xfs_reserve_blocks(mp, i, resblks);
+	}
 }
 
 /*
@@ -976,6 +1005,7 @@ xfs_fs_freeze(
 	if (ret && !xfs_is_readonly(mp)) {
 		xfs_blockgc_start(mp);
 		xfs_inodegc_start(mp);
+		xfs_zone_gc_start(mp);
 	}
 
 	return ret;
@@ -997,6 +1027,7 @@ xfs_fs_unfreeze(
 	 * filesystem.
 	 */
 	if (!xfs_is_readonly(mp)) {
+		xfs_zone_gc_start(mp);
 		xfs_blockgc_start(mp);
 		xfs_inodegc_start(mp);
 	}
@@ -1058,6 +1089,19 @@ xfs_finish_flags(
 		return -EINVAL;
 	}
 
+	if (!xfs_has_zoned(mp)) {
+		if (mp->m_max_open_zones) {
+			xfs_warn(mp,
+"max_open_zones mount option only supported on zoned file systems.");
+			return -EINVAL;
+		}
+		if (mp->m_features & XFS_FEAT_NOLIFETIME) {
+			xfs_warn(mp,
+"nolifetime mount option only supported on zoned file systems.");
+			return -EINVAL;
+		}
+	}
+
 	return 0;
 }
 
@@ -1065,7 +1109,8 @@ static int
 xfs_init_percpu_counters(
 	struct xfs_mount	*mp)
 {
-	int		error;
+	int			error;
+	int			i;
 
 	error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL);
 	if (error)
@@ -1075,30 +1120,29 @@ xfs_init_percpu_counters(
 	if (error)
 		goto free_icount;
 
-	error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL);
-	if (error)
-		goto free_ifree;
-
 	error = percpu_counter_init(&mp->m_delalloc_blks, 0, GFP_KERNEL);
 	if (error)
-		goto free_fdblocks;
+		goto free_ifree;
 
 	error = percpu_counter_init(&mp->m_delalloc_rtextents, 0, GFP_KERNEL);
 	if (error)
 		goto free_delalloc;
 
-	error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL);
-	if (error)
-		goto free_delalloc_rt;
+	for (i = 0; i < XC_FREE_NR; i++) {
+		error = percpu_counter_init(&mp->m_free[i].count, 0,
+				GFP_KERNEL);
+		if (error)
+			goto free_freecounters;
+	}
 
 	return 0;
 
-free_delalloc_rt:
+free_freecounters:
+	while (--i >= 0)
+		percpu_counter_destroy(&mp->m_free[i].count);
 	percpu_counter_destroy(&mp->m_delalloc_rtextents);
 free_delalloc:
 	percpu_counter_destroy(&mp->m_delalloc_blks);
-free_fdblocks:
-	percpu_counter_destroy(&mp->m_fdblocks);
 free_ifree:
 	percpu_counter_destroy(&mp->m_ifree);
 free_icount:
@@ -1112,24 +1156,28 @@ xfs_reinit_percpu_counters(
 {
 	percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
 	percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
-	percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks);
-	percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
+	xfs_set_freecounter(mp, XC_FREE_BLOCKS, mp->m_sb.sb_fdblocks);
+	if (!xfs_has_zoned(mp))
+ xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, + mp->m_sb.sb_frextents); } static void xfs_destroy_percpu_counters( struct xfs_mount *mp) { + enum xfs_free_counter i; + + for (i = 0; i < XC_FREE_NR; i++) + percpu_counter_destroy(&mp->m_free[i].count); percpu_counter_destroy(&mp->m_icount); percpu_counter_destroy(&mp->m_ifree); - percpu_counter_destroy(&mp->m_fdblocks); ASSERT(xfs_is_shutdown(mp) || percpu_counter_sum(&mp->m_delalloc_rtextents) == 0); percpu_counter_destroy(&mp->m_delalloc_rtextents); ASSERT(xfs_is_shutdown(mp) || percpu_counter_sum(&mp->m_delalloc_blks) == 0); percpu_counter_destroy(&mp->m_delalloc_blks); - percpu_counter_destroy(&mp->m_frextents); } static int @@ -1210,6 +1258,18 @@ xfs_fs_shutdown( xfs_force_shutdown(XFS_M(sb), SHUTDOWN_DEVICE_REMOVED); } +static int +xfs_fs_show_stats( + struct seq_file *m, + struct dentry *root) +{ + struct xfs_mount *mp = XFS_M(root->d_sb); + + if (xfs_has_zoned(mp) && IS_ENABLED(CONFIG_XFS_RT)) + xfs_zoned_show_stats(m, mp); + return 0; +} + static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, @@ -1224,6 +1284,7 @@ static const struct super_operations xfs_super_operations = { .nr_cached_objects = xfs_fs_nr_cached_objects, .free_cached_objects = xfs_fs_free_cached_objects, .shutdown = xfs_fs_shutdown, + .show_stats = xfs_fs_show_stats, }; static int @@ -1436,6 +1497,15 @@ xfs_fs_parse_param( xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true); parsing_mp->m_features |= XFS_FEAT_NOATTR2; return 0; + case Opt_max_open_zones: + parsing_mp->m_max_open_zones = result.uint_32; + return 0; + case Opt_lifetime: + parsing_mp->m_features &= ~XFS_FEAT_NOLIFETIME; + return 0; + case Opt_nolifetime: + parsing_mp->m_features |= XFS_FEAT_NOLIFETIME; + return 0; default: xfs_warn(parsing_mp, "unknown mount option [%s].", param->key); return -EINVAL; @@ -1780,8 +1850,17 @@ xfs_fs_fill_super( mp->m_features &= ~XFS_FEAT_DISCARD; } - if (xfs_has_metadir(mp)) + if (xfs_has_zoned(mp)) { + if (!xfs_has_metadir(mp)) { + xfs_alert(mp, + "metadir feature required for zoned realtime devices."); + error = -EINVAL; + goto out_filestream_unmount; + } + xfs_warn_experimental(mp, XFS_EXPERIMENTAL_ZONED); + } else if (xfs_has_metadir(mp)) { xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR); + } if (xfs_has_reflink(mp)) { if (xfs_has_realtime(mp) && @@ -1793,6 +1872,13 @@ xfs_fs_fill_super( goto out_filestream_unmount; } + if (xfs_has_zoned(mp)) { + xfs_alert(mp, + "reflink not compatible with zoned RT device!"); + error = -EINVAL; + goto out_filestream_unmount; + } + if (xfs_globals.always_cow) { xfs_info(mp, "using DEBUG-only always_cow mode."); mp->m_always_cow = true; @@ -1917,6 +2003,9 @@ xfs_remount_rw( /* Re-enable the background inode inactivation worker. */ xfs_inodegc_start(mp); + /* Restart zone reclaim */ + xfs_zone_gc_start(mp); + return 0; } @@ -1961,6 +2050,9 @@ xfs_remount_ro( */ xfs_inodegc_stop(mp); + /* Stop zone reclaim */ + xfs_zone_gc_stop(mp); + /* Free the per-AG metadata reservation pool. 
*/ xfs_fs_unreserve_ag_blocks(mp); @@ -2082,6 +2174,7 @@ xfs_init_fs_context( for (i = 0; i < XG_TYPE_MAX; i++) xa_init(&mp->m_groups[i].xa); mutex_init(&mp->m_growlock); + mutex_init(&mp->m_metafile_resv_lock); INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); mp->m_kobj.kobject.kset = xfs_kset; diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 60cb5318fdae..b0857e3c1270 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -13,6 +13,7 @@ #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_mount.h" +#include "xfs_zones.h" struct xfs_sysfs_attr { struct attribute attr; @@ -69,7 +70,7 @@ static struct attribute *xfs_mp_attrs[] = { }; ATTRIBUTE_GROUPS(xfs_mp); -const struct kobj_type xfs_mp_ktype = { +static const struct kobj_type xfs_mp_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_mp_groups, @@ -701,45 +702,103 @@ out_error: return error; } +static inline struct xfs_mount *zoned_to_mp(struct kobject *kobj) +{ + return container_of(to_kobj(kobj), struct xfs_mount, m_zoned_kobj); +} + +static ssize_t +max_open_zones_show( + struct kobject *kobj, + char *buf) +{ + /* only report the open zones available for user data */ + return sysfs_emit(buf, "%u\n", + zoned_to_mp(kobj)->m_max_open_zones - XFS_OPEN_GC_ZONES); +} +XFS_SYSFS_ATTR_RO(max_open_zones); + +static struct attribute *xfs_zoned_attrs[] = { + ATTR_LIST(max_open_zones), + NULL, +}; +ATTRIBUTE_GROUPS(xfs_zoned); + +static const struct kobj_type xfs_zoned_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, + .default_groups = xfs_zoned_groups, +}; + int -xfs_error_sysfs_init( +xfs_mount_sysfs_init( struct xfs_mount *mp) { int error; + super_set_sysfs_name_id(mp->m_super); + + /* .../xfs/<dev>/ */ + error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, + NULL, mp->m_super->s_id); + if (error) + return error; + + /* .../xfs/<dev>/stats/ */ + error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype, + &mp->m_kobj, "stats"); + if (error) + goto out_remove_fsdir; + /* .../xfs/<dev>/error/ */ error = xfs_sysfs_init(&mp->m_error_kobj, &xfs_error_ktype, &mp->m_kobj, "error"); if (error) - return error; + goto out_remove_stats_dir; + /* .../xfs/<dev>/error/fail_at_unmount */ error = sysfs_create_file(&mp->m_error_kobj.kobject, ATTR_LIST(fail_at_unmount)); if (error) - goto out_error; + goto out_remove_error_dir; /* .../xfs/<dev>/error/metadata/ */ error = xfs_error_sysfs_init_class(mp, XFS_ERR_METADATA, "metadata", &mp->m_error_meta_kobj, xfs_error_meta_init); if (error) - goto out_error; + goto out_remove_error_dir; + + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp)) { + /* .../xfs/<dev>/zoned/ */ + error = xfs_sysfs_init(&mp->m_zoned_kobj, &xfs_zoned_ktype, + &mp->m_kobj, "zoned"); + if (error) + goto out_remove_error_dir; + } return 0; -out_error: +out_remove_error_dir: xfs_sysfs_del(&mp->m_error_kobj); +out_remove_stats_dir: + xfs_sysfs_del(&mp->m_stats.xs_kobj); +out_remove_fsdir: + xfs_sysfs_del(&mp->m_kobj); return error; } void -xfs_error_sysfs_del( +xfs_mount_sysfs_del( struct xfs_mount *mp) { struct xfs_error_cfg *cfg; int i, j; + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp)) + xfs_sysfs_del(&mp->m_zoned_kobj); + for (i = 0; i < XFS_ERR_CLASS_MAX; i++) { for (j = 0; j < XFS_ERR_ERRNO_MAX; j++) { cfg = &mp->m_error_cfg[i][j]; @@ -749,6 +808,8 @@ xfs_error_sysfs_del( } xfs_sysfs_del(&mp->m_error_meta_kobj); xfs_sysfs_del(&mp->m_error_kobj); + 
xfs_sysfs_del(&mp->m_stats.xs_kobj); + xfs_sysfs_del(&mp->m_kobj); } struct xfs_error_cfg * diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h index 148893ebfdef..1622fe80ad3e 100644 --- a/fs/xfs/xfs_sysfs.h +++ b/fs/xfs/xfs_sysfs.h @@ -7,7 +7,6 @@ #ifndef __XFS_SYSFS_H__ #define __XFS_SYSFS_H__ -extern const struct kobj_type xfs_mp_ktype; /* xfs_mount */ extern const struct kobj_type xfs_dbg_ktype; /* debug */ extern const struct kobj_type xfs_log_ktype; /* xlog */ extern const struct kobj_type xfs_stats_ktype; /* stats */ @@ -53,7 +52,7 @@ xfs_sysfs_del( wait_for_completion(&kobj->complete); } -int xfs_error_sysfs_init(struct xfs_mount *mp); -void xfs_error_sysfs_del(struct xfs_mount *mp); +int xfs_mount_sysfs_init(struct xfs_mount *mp); +void xfs_mount_sysfs_del(struct xfs_mount *mp); #endif /* __XFS_SYSFS_H__ */ diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 8f530e69c18a..a60556dbd172 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -49,6 +49,8 @@ #include "xfs_metafile.h" #include "xfs_metadir.h" #include "xfs_rtgroup.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index bfc2f1249022..e56ba1963160 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -102,6 +102,7 @@ struct xfs_rmap_intent; struct xfs_refcount_intent; struct xfs_metadir_update; struct xfs_rtgroup; +struct xfs_open_zone; #define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ @@ -265,6 +266,152 @@ DEFINE_GROUP_REF_EVENT(xfs_group_grab); DEFINE_GROUP_REF_EVENT(xfs_group_grab_next_tag); DEFINE_GROUP_REF_EVENT(xfs_group_rele); +#ifdef CONFIG_XFS_RT +DECLARE_EVENT_CLASS(xfs_zone_class, + TP_PROTO(struct xfs_rtgroup *rtg), + TP_ARGS(rtg), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rgblock_t, used) + __field(unsigned int, nr_open) + ), + TP_fast_assign( + struct xfs_mount *mp = rtg_mount(rtg); + + __entry->dev = mp->m_super->s_dev; + __entry->rgno = rtg_rgno(rtg); + __entry->used = rtg_rmap(rtg)->i_used_blocks; + __entry->nr_open = mp->m_zone_info->zi_nr_open_zones; + ), + TP_printk("dev %d:%d rgno 0x%x used 0x%x nr_open %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->used, + __entry->nr_open) +); + +#define DEFINE_ZONE_EVENT(name) \ +DEFINE_EVENT(xfs_zone_class, name, \ + TP_PROTO(struct xfs_rtgroup *rtg), \ + TP_ARGS(rtg)) +DEFINE_ZONE_EVENT(xfs_zone_emptied); +DEFINE_ZONE_EVENT(xfs_zone_full); +DEFINE_ZONE_EVENT(xfs_zone_opened); +DEFINE_ZONE_EVENT(xfs_zone_reset); +DEFINE_ZONE_EVENT(xfs_zone_gc_target_opened); + +TRACE_EVENT(xfs_zone_free_blocks, + TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno, + xfs_extlen_t len), + TP_ARGS(rtg, rgbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rgblock_t, used) + __field(xfs_rgblock_t, rgbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = rtg_mount(rtg)->m_super->s_dev; + __entry->rgno = rtg_rgno(rtg); + __entry->used = rtg_rmap(rtg)->i_used_blocks; + __entry->rgbno = rgbno; + __entry->len = len; + ), + TP_printk("dev %d:%d rgno 0x%x used 0x%x rgbno 0x%x len 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->used, + __entry->rgbno, + __entry->len) +); + +DECLARE_EVENT_CLASS(xfs_zone_alloc_class, + TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno, + xfs_extlen_t len), + TP_ARGS(oz, rgbno, len), + TP_STRUCT__entry( + 
__field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rgblock_t, used) + __field(xfs_rgblock_t, written) + __field(xfs_rgblock_t, write_pointer) + __field(xfs_rgblock_t, rgbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = rtg_mount(oz->oz_rtg)->m_super->s_dev; + __entry->rgno = rtg_rgno(oz->oz_rtg); + __entry->used = rtg_rmap(oz->oz_rtg)->i_used_blocks; + __entry->written = oz->oz_written; + __entry->write_pointer = oz->oz_write_pointer; + __entry->rgbno = rgbno; + __entry->len = len; + ), + TP_printk("dev %d:%d rgno 0x%x used 0x%x written 0x%x wp 0x%x rgbno 0x%x len 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->used, + __entry->written, + __entry->write_pointer, + __entry->rgbno, + __entry->len) +); + +#define DEFINE_ZONE_ALLOC_EVENT(name) \ +DEFINE_EVENT(xfs_zone_alloc_class, name, \ + TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno, \ + xfs_extlen_t len), \ + TP_ARGS(oz, rgbno, len)) +DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks); +DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks); + +TRACE_EVENT(xfs_zone_gc_select_victim, + TP_PROTO(struct xfs_rtgroup *rtg, unsigned int bucket), + TP_ARGS(rtg, bucket), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rgblock_t, used) + __field(unsigned int, bucket) + ), + TP_fast_assign( + __entry->dev = rtg_mount(rtg)->m_super->s_dev; + __entry->rgno = rtg_rgno(rtg); + __entry->used = rtg_rmap(rtg)->i_used_blocks; + __entry->bucket = bucket; + ), + TP_printk("dev %d:%d rgno 0x%x used 0x%x bucket %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->used, + __entry->bucket) +); + +TRACE_EVENT(xfs_zones_mount, + TP_PROTO(struct xfs_mount *mp), + TP_ARGS(mp), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgcount) + __field(uint32_t, blocks) + __field(unsigned int, max_open_zones) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rgcount = mp->m_sb.sb_rgcount; + __entry->blocks = mp->m_groups[XG_TYPE_RTG].blocks; + __entry->max_open_zones = mp->m_max_open_zones; + ), + TP_printk("dev %d:%d zoned %u blocks_per_zone %u, max_open %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgcount, + __entry->blocks, + __entry->max_open_zones) +); +#endif /* CONFIG_XFS_RT */ + TRACE_EVENT(xfs_inodegc_worker, TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits), TP_ARGS(mp, shrinker_hits), @@ -545,6 +692,10 @@ DEFINE_BUF_EVENT(xfs_buf_iodone_async); DEFINE_BUF_EVENT(xfs_buf_error_relse); DEFINE_BUF_EVENT(xfs_buf_drain_buftarg); DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); +DEFINE_BUF_EVENT(xfs_buf_backing_folio); +DEFINE_BUF_EVENT(xfs_buf_backing_kmem); +DEFINE_BUF_EVENT(xfs_buf_backing_vmalloc); +DEFINE_BUF_EVENT(xfs_buf_backing_fallback); /* not really buffer traces, but the buf provides useful information */ DEFINE_BUF_EVENT(xfs_btree_corrupt); @@ -1596,6 +1747,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write); DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten); DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append); DEFINE_SIMPLE_IO_EVENT(xfs_file_splice_read); +DEFINE_SIMPLE_IO_EVENT(xfs_zoned_map_blocks); DECLARE_EVENT_CLASS(xfs_itrunc_class, TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), @@ -3983,6 +4135,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to); 
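
[Editor's aside] All of the zone tracepoints above follow the file's usual split: one DECLARE_EVENT_CLASS carries the TP_STRUCT__entry/TP_fast_assign/TP_printk boilerplate, and each DEFINE_EVENT stamps out a named tracepoint that shares it. A hypothetical minimal instance of the same pattern (names invented, structure copied from this file):

	DECLARE_EVENT_CLASS(xfs_example_class,
		TP_PROTO(struct xfs_mount *mp),
		TP_ARGS(mp),
		TP_STRUCT__entry(
			__field(dev_t, dev)
		),
		TP_fast_assign(
			__entry->dev = mp->m_super->s_dev;
		),
		TP_printk("dev %d:%d",
			  MAJOR(__entry->dev), MINOR(__entry->dev))
	);

	#define DEFINE_EXAMPLE_EVENT(name) \
	DEFINE_EVENT(xfs_example_class, name, \
		TP_PROTO(struct xfs_mount *mp), \
		TP_ARGS(mp))
	DEFINE_EXAMPLE_EVENT(xfs_example_event);
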
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_skip); DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error); @@ -5606,11 +5759,10 @@ DEFINE_METADIR_EVENT(xfs_metadir_lookup); /* metadata inode space reservations */ DECLARE_EVENT_CLASS(xfs_metafile_resv_class, - TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len), - TP_ARGS(ip, len), + TP_PROTO(struct xfs_mount *mp, xfs_filblks_t len), + TP_ARGS(mp, len), TP_STRUCT__entry( __field(dev_t, dev) - __field(xfs_ino_t, ino) __field(unsigned long long, freeblks) __field(unsigned long long, reserved) __field(unsigned long long, asked) @@ -5618,19 +5770,15 @@ DECLARE_EVENT_CLASS(xfs_metafile_resv_class, __field(unsigned long long, len) ), TP_fast_assign( - struct xfs_mount *mp = ip->i_mount; - __entry->dev = mp->m_super->s_dev; - __entry->ino = ip->i_ino; - __entry->freeblks = percpu_counter_sum(&mp->m_fdblocks); - __entry->reserved = ip->i_delayed_blks; - __entry->asked = ip->i_meta_resv_asked; - __entry->used = ip->i_nblocks; + __entry->freeblks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS); + __entry->reserved = mp->m_metafile_resv_avail; + __entry->asked = mp->m_metafile_resv_target; + __entry->used = mp->m_metafile_resv_used; __entry->len = len; ), - TP_printk("dev %d:%d ino 0x%llx freeblks %llu resv %llu ask %llu used %llu len %llu", + TP_printk("dev %d:%d freeblks %llu resv %llu ask %llu used %llu len %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, __entry->freeblks, __entry->reserved, __entry->asked, @@ -5639,14 +5787,14 @@ DECLARE_EVENT_CLASS(xfs_metafile_resv_class, ) #define DEFINE_METAFILE_RESV_EVENT(name) \ DEFINE_EVENT(xfs_metafile_resv_class, name, \ - TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len), \ - TP_ARGS(ip, len)) + TP_PROTO(struct xfs_mount *mp, xfs_filblks_t len), \ + TP_ARGS(mp, len)) DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init); DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free); DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_alloc_space); DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free_space); DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_critical); -DEFINE_INODE_ERROR_EVENT(xfs_metafile_resv_init_error); +DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init_error); #ifdef CONFIG_XFS_RT TRACE_EVENT(xfs_growfs_check_rtgeom, @@ -5669,6 +5817,46 @@ TRACE_EVENT(xfs_growfs_check_rtgeom, ); #endif /* CONFIG_XFS_RT */ +TRACE_DEFINE_ENUM(XC_FREE_BLOCKS); +TRACE_DEFINE_ENUM(XC_FREE_RTEXTENTS); +TRACE_DEFINE_ENUM(XC_FREE_RTAVAILABLE); + +DECLARE_EVENT_CLASS(xfs_freeblocks_resv_class, + TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr, + uint64_t delta, unsigned long caller_ip), + TP_ARGS(mp, ctr, delta, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(enum xfs_free_counter, ctr) + __field(uint64_t, delta) + __field(uint64_t, avail) + __field(uint64_t, total) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ctr = ctr; + __entry->delta = delta; + __entry->avail = mp->m_free[ctr].res_avail; + __entry->total = mp->m_free[ctr].res_total; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ctr %s delta %llu avail %llu total %llu caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->ctr, XFS_FREECOUNTER_STR), + __entry->delta, + __entry->avail, + __entry->total, + (char *)__entry->caller_ip) +) +#define DEFINE_FREEBLOCKS_RESV_EVENT(name) \ +DEFINE_EVENT(xfs_freeblocks_resv_class, name, \ + TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter 
ctr, \ + uint64_t delta, unsigned long caller_ip), \ + TP_ARGS(mp, ctr, delta, caller_ip)) +DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_reserved); +DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_enospc); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c new file mode 100644 index 000000000000..52af234936a2 --- /dev/null +++ b/fs/xfs/xfs_zone_alloc.c @@ -0,0 +1,1220 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_error.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_iomap.h" +#include "xfs_trans.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_refcount.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" +#include "xfs_zones.h" +#include "xfs_trace.h" + +void +xfs_open_zone_put( + struct xfs_open_zone *oz) +{ + if (atomic_dec_and_test(&oz->oz_ref)) { + xfs_rtgroup_rele(oz->oz_rtg); + kfree(oz); + } +} + +static inline uint32_t +xfs_zone_bucket( + struct xfs_mount *mp, + uint32_t used_blocks) +{ + return XFS_ZONE_USED_BUCKETS * used_blocks / + mp->m_groups[XG_TYPE_RTG].blocks; +} + +static inline void +xfs_zone_add_to_bucket( + struct xfs_zone_info *zi, + xfs_rgnumber_t rgno, + uint32_t to_bucket) +{ + __set_bit(rgno, zi->zi_used_bucket_bitmap[to_bucket]); + zi->zi_used_bucket_entries[to_bucket]++; +} + +static inline void +xfs_zone_remove_from_bucket( + struct xfs_zone_info *zi, + xfs_rgnumber_t rgno, + uint32_t from_bucket) +{ + __clear_bit(rgno, zi->zi_used_bucket_bitmap[from_bucket]); + zi->zi_used_bucket_entries[from_bucket]--; +} + +static void +xfs_zone_account_reclaimable( + struct xfs_rtgroup *rtg, + uint32_t freed) +{ + struct xfs_group *xg = &rtg->rtg_group; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_zone_info *zi = mp->m_zone_info; + uint32_t used = rtg_rmap(rtg)->i_used_blocks; + xfs_rgnumber_t rgno = rtg_rgno(rtg); + uint32_t from_bucket = xfs_zone_bucket(mp, used + freed); + uint32_t to_bucket = xfs_zone_bucket(mp, used); + bool was_full = (used + freed == rtg_blocks(rtg)); + + /* + * This can be called from log recovery, where the zone_info structure + * hasn't been allocated yet. Skip all work as xfs_mount_zones will + * add the zones to the right buckets before the file system becomes + * active. + */ + if (!zi) + return; + + if (!used) { + /* + * The zone is now empty; remove it from the bottom bucket and + * trigger a reset. + */ + trace_xfs_zone_emptied(rtg); + + if (!was_full) + xfs_group_clear_mark(xg, XFS_RTG_RECLAIMABLE); + + spin_lock(&zi->zi_used_buckets_lock); + if (!was_full) + xfs_zone_remove_from_bucket(zi, rgno, from_bucket); + spin_unlock(&zi->zi_used_buckets_lock); + + spin_lock(&zi->zi_reset_list_lock); + xg->xg_next_reset = zi->zi_reset_list; + zi->zi_reset_list = xg; + spin_unlock(&zi->zi_reset_list_lock); + + if (zi->zi_gc_thread) + wake_up_process(zi->zi_gc_thread); + } else if (was_full) { + /* + * The zone transitioned from full: mark it as reclaimable + * and wake up GC which might be waiting for zones to reclaim.
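+ *
+ * (Worked example for the bucket math above, assuming
+ * XFS_ZONE_USED_BUCKETS == 16 -- the constant lives in xfs_zone_priv.h,
+ * outside this hunk -- and a 65536-block zone: used == 40000 lands in
+ * bucket 16 * 40000 / 65536 = 9, while used == 4000 lands in bucket 0,
+ * so the lowest-numbered buckets always hold the cheapest zones to GC.)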
+ */ + spin_lock(&zi->zi_used_buckets_lock); + xfs_zone_add_to_bucket(zi, rgno, to_bucket); + spin_unlock(&zi->zi_used_buckets_lock); + + xfs_group_set_mark(xg, XFS_RTG_RECLAIMABLE); + if (zi->zi_gc_thread && xfs_zoned_need_gc(mp)) + wake_up_process(zi->zi_gc_thread); + } else if (to_bucket != from_bucket) { + /* + * Move the zone to a new bucket if it dropped below the + * threshold. + */ + spin_lock(&zi->zi_used_buckets_lock); + xfs_zone_add_to_bucket(zi, rgno, to_bucket); + xfs_zone_remove_from_bucket(zi, rgno, from_bucket); + spin_unlock(&zi->zi_used_buckets_lock); + } +} + +static void +xfs_open_zone_mark_full( + struct xfs_open_zone *oz) +{ + struct xfs_rtgroup *rtg = oz->oz_rtg; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_zone_info *zi = mp->m_zone_info; + uint32_t used = rtg_rmap(rtg)->i_used_blocks; + + trace_xfs_zone_full(rtg); + + WRITE_ONCE(rtg->rtg_open_zone, NULL); + + spin_lock(&zi->zi_open_zones_lock); + if (oz->oz_is_gc) { + ASSERT(current == zi->zi_gc_thread); + zi->zi_open_gc_zone = NULL; + } else { + zi->zi_nr_open_zones--; + list_del_init(&oz->oz_entry); + } + spin_unlock(&zi->zi_open_zones_lock); + xfs_open_zone_put(oz); + + wake_up_all(&zi->zi_zone_wait); + if (used < rtg_blocks(rtg)) + xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used); +} + +static void +xfs_zone_record_blocks( + struct xfs_trans *tp, + xfs_fsblock_t fsbno, + xfs_filblks_t len, + struct xfs_open_zone *oz, + bool used) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rtgroup *rtg = oz->oz_rtg; + struct xfs_inode *rmapip = rtg_rmap(rtg); + + trace_xfs_zone_record_blocks(oz, xfs_rtb_to_rgbno(mp, fsbno), len); + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); + if (used) { + rmapip->i_used_blocks += len; + ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg)); + } else { + xfs_add_frextents(mp, len); + } + oz->oz_written += len; + if (oz->oz_written == rtg_blocks(rtg)) + xfs_open_zone_mark_full(oz); + xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE); +} + +static int +xfs_zoned_map_extent( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_bmbt_irec *new, + struct xfs_open_zone *oz, + xfs_fsblock_t old_startblock) +{ + struct xfs_bmbt_irec data; + int nmaps = 1; + int error; + + /* Grab the corresponding mapping in the data fork. */ + error = xfs_bmapi_read(ip, new->br_startoff, new->br_blockcount, &data, + &nmaps, 0); + if (error) + return error; + + /* + * Cap the update to the existing extent in the data fork because we can + * only overwrite one extent at a time. + */ + ASSERT(new->br_blockcount >= data.br_blockcount); + new->br_blockcount = data.br_blockcount; + + /* + * If a data write raced with this GC write, keep the existing data in + * the data fork, mark our newly written GC extent as reclaimable, then + * move on to the next extent. 
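+ *
+ * The race is detected purely by comparing block numbers; the remap
+ * below is skipped exactly when
+ *
+ *	old_startblock != NULLFSBLOCK &&	(this is a GC write)
+ *	old_startblock != data.br_startblock	(the data moved meanwhile)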
+ */ + if (old_startblock != NULLFSBLOCK && + old_startblock != data.br_startblock) + goto skip; + + trace_xfs_reflink_cow_remap_from(ip, new); + trace_xfs_reflink_cow_remap_to(ip, &data); + + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, + XFS_IEXT_REFLINK_END_COW_CNT); + if (error) + return error; + + if (data.br_startblock != HOLESTARTBLOCK) { + ASSERT(data.br_startblock != DELAYSTARTBLOCK); + ASSERT(!isnullstartblock(data.br_startblock)); + + xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data); + if (xfs_is_reflink_inode(ip)) { + xfs_refcount_decrease_extent(tp, true, &data); + } else { + error = xfs_free_extent_later(tp, data.br_startblock, + data.br_blockcount, NULL, + XFS_AG_RESV_NONE, + XFS_FREE_EXTENT_REALTIME); + if (error) + return error; + } + } + + xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz, + true); + + /* Map the new blocks into the data fork. */ + xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new); + return 0; + +skip: + trace_xfs_reflink_cow_remap_skip(ip, new); + xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz, + false); + return 0; +} + +int +xfs_zoned_end_io( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count, + xfs_daddr_t daddr, + struct xfs_open_zone *oz, + xfs_fsblock_t old_startblock) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); + struct xfs_bmbt_irec new = { + .br_startoff = XFS_B_TO_FSBT(mp, offset), + .br_startblock = xfs_daddr_to_rtb(mp, daddr), + .br_state = XFS_EXT_NORM, + }; + unsigned int resblks = + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + struct xfs_trans *tp; + int error; + + if (xfs_is_shutdown(mp)) + return -EIO; + + while (new.br_startoff < end_fsb) { + new.br_blockcount = end_fsb - new.br_startoff; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, + XFS_TRANS_RESERVE | XFS_TRANS_RES_FDBLKS, &tp); + if (error) + return error; + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + error = xfs_zoned_map_extent(tp, ip, &new, oz, old_startblock); + if (error) + xfs_trans_cancel(tp); + else + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + return error; + + new.br_startoff += new.br_blockcount; + new.br_startblock += new.br_blockcount; + if (old_startblock != NULLFSBLOCK) + old_startblock += new.br_blockcount; + } + + return 0; +} + +/* + * "Free" blocks allocated in a zone. + * + * Just decrement the used blocks counter and report the space as freed. + */ +int +xfs_zone_free_blocks( + struct xfs_trans *tp, + struct xfs_rtgroup *rtg, + xfs_fsblock_t fsbno, + xfs_filblks_t len) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *rmapip = rtg_rmap(rtg); + + xfs_assert_ilocked(rmapip, XFS_ILOCK_EXCL); + + if (len > rmapip->i_used_blocks) { + xfs_err(mp, +"trying to free more blocks (%lld) than used counter (%u).", + len, rmapip->i_used_blocks); + ASSERT(len <= rmapip->i_used_blocks); + xfs_rtginode_mark_sick(rtg, XFS_RTGI_RMAP); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return -EFSCORRUPTED; + } + + trace_xfs_zone_free_blocks(rtg, xfs_rtb_to_rgbno(mp, fsbno), len); + + rmapip->i_used_blocks -= len; + /* + * Don't add open zones to the reclaimable buckets. The I/O completion + * for writing the last block will take care of accounting for already + * unused blocks instead. 
+ */ + if (!READ_ONCE(rtg->rtg_open_zone)) + xfs_zone_account_reclaimable(rtg, len); + xfs_add_frextents(mp, len); + xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE); + return 0; +} + +/* + * Check if the zone containing the data just before the offset we are + * writing to is still open and has space. + */ +static struct xfs_open_zone * +xfs_last_used_zone( + struct iomap_ioend *ioend) +{ + struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSB(mp, ioend->io_offset); + struct xfs_rtgroup *rtg = NULL; + struct xfs_open_zone *oz = NULL; + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (!xfs_iext_lookup_extent_before(ip, &ip->i_df, &offset_fsb, + &icur, &got)) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return NULL; + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + rtg = xfs_rtgroup_grab(mp, xfs_rtb_to_rgno(mp, got.br_startblock)); + if (!rtg) + return NULL; + + xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_SHARED); + oz = READ_ONCE(rtg->rtg_open_zone); + if (oz && (oz->oz_is_gc || !atomic_inc_not_zero(&oz->oz_ref))) + oz = NULL; + xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_SHARED); + + xfs_rtgroup_rele(rtg); + return oz; +} + +static struct xfs_group * +xfs_find_free_zone( + struct xfs_mount *mp, + unsigned long start, + unsigned long end) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + XA_STATE (xas, &mp->m_groups[XG_TYPE_RTG].xa, start); + struct xfs_group *xg; + + xas_lock(&xas); + xas_for_each_marked(&xas, xg, end, XFS_RTG_FREE) + if (atomic_inc_not_zero(&xg->xg_active_ref)) + goto found; + xas_unlock(&xas); + return NULL; + +found: + xas_clear_mark(&xas, XFS_RTG_FREE); + atomic_dec(&zi->zi_nr_free_zones); + zi->zi_free_zone_cursor = xg->xg_gno; + xas_unlock(&xas); + return xg; +} + +static struct xfs_open_zone * +xfs_init_open_zone( + struct xfs_rtgroup *rtg, + xfs_rgblock_t write_pointer, + enum rw_hint write_hint, + bool is_gc) +{ + struct xfs_open_zone *oz; + + oz = kzalloc(sizeof(*oz), GFP_NOFS | __GFP_NOFAIL); + spin_lock_init(&oz->oz_alloc_lock); + atomic_set(&oz->oz_ref, 1); + oz->oz_rtg = rtg; + oz->oz_write_pointer = write_pointer; + oz->oz_written = write_pointer; + oz->oz_write_hint = write_hint; + oz->oz_is_gc = is_gc; + + /* + * All dereferences of rtg->rtg_open_zone hold the ILOCK for the rmap + * inode, but we don't really want to take that here because we are + * under the zone_list_lock. Ensure the pointer is only set for a fully + * initialized open zone structure so that a racy lookup finding it is + * fine. + */ + WRITE_ONCE(rtg->rtg_open_zone, oz); + return oz; +} + +/* + * Find a completely free zone, open it, and return a reference. 
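+ *
+ * The search starts at zi_free_zone_cursor and wraps around: for
+ * example, with zones 2 and 7 free and the cursor at 5, the first pass
+ * over [5, ULONG_MAX] returns zone 7, and the second pass over the
+ * range [0, cursor] only runs if the first one came up empty.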
+ */ +struct xfs_open_zone * +xfs_open_zone( + struct xfs_mount *mp, + enum rw_hint write_hint, + bool is_gc) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_group *xg; + + xg = xfs_find_free_zone(mp, zi->zi_free_zone_cursor, ULONG_MAX); + if (!xg) + xg = xfs_find_free_zone(mp, 0, zi->zi_free_zone_cursor); + if (!xg) + return NULL; + + set_current_state(TASK_RUNNING); + return xfs_init_open_zone(to_rtg(xg), 0, write_hint, is_gc); +} + +static struct xfs_open_zone * +xfs_try_open_zone( + struct xfs_mount *mp, + enum rw_hint write_hint) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_open_zone *oz; + + if (zi->zi_nr_open_zones >= mp->m_max_open_zones - XFS_OPEN_GC_ZONES) + return NULL; + if (atomic_read(&zi->zi_nr_free_zones) < + XFS_GC_ZONES - XFS_OPEN_GC_ZONES) + return NULL; + + /* + * Increment the open zone count to reserve our slot before dropping + * zi_open_zones_lock. + */ + zi->zi_nr_open_zones++; + spin_unlock(&zi->zi_open_zones_lock); + oz = xfs_open_zone(mp, write_hint, false); + spin_lock(&zi->zi_open_zones_lock); + if (!oz) { + zi->zi_nr_open_zones--; + return NULL; + } + + atomic_inc(&oz->oz_ref); + list_add_tail(&oz->oz_entry, &zi->zi_open_zones); + + /* + * If this was the last free zone, other waiters might be waiting + * on us to write to it as well. + */ + wake_up_all(&zi->zi_zone_wait); + + if (xfs_zoned_need_gc(mp)) + wake_up_process(zi->zi_gc_thread); + + trace_xfs_zone_opened(oz->oz_rtg); + return oz; +} + +/* + * For data with short or medium lifetime, try to colocate it into an + * already open zone with a matching temperature. + */ +static bool +xfs_colocate_eagerly( + enum rw_hint file_hint) +{ + switch (file_hint) { + case WRITE_LIFE_MEDIUM: + case WRITE_LIFE_SHORT: + case WRITE_LIFE_NONE: + return true; + default: + return false; + } +} + +static bool +xfs_good_hint_match( + struct xfs_open_zone *oz, + enum rw_hint file_hint) +{ + switch (oz->oz_write_hint) { + case WRITE_LIFE_LONG: + case WRITE_LIFE_EXTREME: + /* colocate long and extreme */ + if (file_hint == WRITE_LIFE_LONG || + file_hint == WRITE_LIFE_EXTREME) + return true; + break; + case WRITE_LIFE_MEDIUM: + /* colocate medium with medium */ + if (file_hint == WRITE_LIFE_MEDIUM) + return true; + break; + case WRITE_LIFE_SHORT: + case WRITE_LIFE_NONE: + case WRITE_LIFE_NOT_SET: + /* colocate short and none */ + if (file_hint <= WRITE_LIFE_SHORT) + return true; + break; + } + return false; +} + +static bool +xfs_try_use_zone( + struct xfs_zone_info *zi, + enum rw_hint file_hint, + struct xfs_open_zone *oz, + bool lowspace) +{ + if (oz->oz_write_pointer == rtg_blocks(oz->oz_rtg)) + return false; + if (!lowspace && !xfs_good_hint_match(oz, file_hint)) + return false; + if (!atomic_inc_not_zero(&oz->oz_ref)) + return false; + + /* + * If we have a hint set for the data, use that for the zone even if + * some data was written already without any hint set, but don't change + * the temperature after that as that would make little sense without + * tracking per-temperature class written block counts, which is + * probably overkill anyway. + */ + if (file_hint != WRITE_LIFE_NOT_SET && + oz->oz_write_hint == WRITE_LIFE_NOT_SET) + oz->oz_write_hint = file_hint; + + /* + * If we couldn't match by inode or lifetime, we just pick the first + * zone with enough space above. For that we want the least busy zone + * for some definition of "least" busy. For now this simple LRU + * algorithm that rotates every zone to the end of the list will do it, + * even if it isn't exactly cache friendly.
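+ *
+ * A sketch of the rotation with three open zones A, B, C, where A is
+ * the head and thus the least recently picked: choosing A moves it to
+ * the tail, leaving B, C, A, so the next scan tries B first and writes
+ * naturally spread across all open zones.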
+ */ + if (!list_is_last(&oz->oz_entry, &zi->zi_open_zones)) + list_move_tail(&oz->oz_entry, &zi->zi_open_zones); + return true; +} + +static struct xfs_open_zone * +xfs_select_open_zone_lru( + struct xfs_zone_info *zi, + enum rw_hint file_hint, + bool lowspace) +{ + struct xfs_open_zone *oz; + + lockdep_assert_held(&zi->zi_open_zones_lock); + + list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) + if (xfs_try_use_zone(zi, file_hint, oz, lowspace)) + return oz; + + cond_resched_lock(&zi->zi_open_zones_lock); + return NULL; +} + +static struct xfs_open_zone * +xfs_select_open_zone_mru( + struct xfs_zone_info *zi, + enum rw_hint file_hint) +{ + struct xfs_open_zone *oz; + + lockdep_assert_held(&zi->zi_open_zones_lock); + + list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry) + if (xfs_try_use_zone(zi, file_hint, oz, false)) + return oz; + + cond_resched_lock(&zi->zi_open_zones_lock); + return NULL; +} + +static inline enum rw_hint xfs_inode_write_hint(struct xfs_inode *ip) +{ + if (xfs_has_nolifetime(ip->i_mount)) + return WRITE_LIFE_NOT_SET; + return VFS_I(ip)->i_write_hint; +} + +/* + * Try to tightly pack inodes that are written back after they were closed, + * instead of opening new zones for them or spreading them to the least + * recently used zone. This optimizes the data layout for workloads that untar + * or copy a lot of small files. Right now this does not separate multiple such + * streams. + */ +static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip) +{ + return !inode_is_open_for_write(VFS_I(ip)) && + !(ip->i_diflags & XFS_DIFLAG_APPEND); +} + +/* + * Pick a new zone for writes. + * + * If we aren't using up our budget of open zones, just open a new one from the + * freelist. Otherwise try to find one that matches the expected data lifetime. + * If we don't find a good match, pick any available zone. + */ +static struct xfs_open_zone * +xfs_select_zone_nowait( + struct xfs_mount *mp, + enum rw_hint write_hint, + bool pack_tight) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_open_zone *oz = NULL; + + if (xfs_is_shutdown(mp)) + return NULL; + + /* + * Try to fill up open zones with matching temperature if available. It + * is better to try to colocate data when this is favorable, so we can + * activate empty zones when it is statistically better to separate + * data. + */ + spin_lock(&zi->zi_open_zones_lock); + if (xfs_colocate_eagerly(write_hint)) + oz = xfs_select_open_zone_lru(zi, write_hint, false); + else if (pack_tight) + oz = xfs_select_open_zone_mru(zi, write_hint); + if (oz) + goto out_unlock; + + /* + * See if we can open a new zone and use that. + */ + oz = xfs_try_open_zone(mp, write_hint); + if (oz) + goto out_unlock; + + /* + * Try to colocate cold data with other cold data if we failed to open a + * new zone for it.
+ */ + if (write_hint != WRITE_LIFE_NOT_SET && + !xfs_colocate_eagerly(write_hint)) + oz = xfs_select_open_zone_lru(zi, write_hint, false); + if (!oz) + oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, false); + if (!oz) + oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, true); +out_unlock: + spin_unlock(&zi->zi_open_zones_lock); + return oz; +} + +static struct xfs_open_zone * +xfs_select_zone( + struct xfs_mount *mp, + enum rw_hint write_hint, + bool pack_tight) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + DEFINE_WAIT (wait); + struct xfs_open_zone *oz; + + oz = xfs_select_zone_nowait(mp, write_hint, pack_tight); + if (oz) + return oz; + + for (;;) { + prepare_to_wait(&zi->zi_zone_wait, &wait, TASK_UNINTERRUPTIBLE); + oz = xfs_select_zone_nowait(mp, write_hint, pack_tight); + if (oz) + break; + schedule(); + } + finish_wait(&zi->zi_zone_wait, &wait); + return oz; +} + +static unsigned int +xfs_zone_alloc_blocks( + struct xfs_open_zone *oz, + xfs_filblks_t count_fsb, + sector_t *sector, + bool *is_seq) +{ + struct xfs_rtgroup *rtg = oz->oz_rtg; + struct xfs_mount *mp = rtg_mount(rtg); + xfs_rgblock_t rgbno; + + spin_lock(&oz->oz_alloc_lock); + count_fsb = min3(count_fsb, XFS_MAX_BMBT_EXTLEN, + (xfs_filblks_t)rtg_blocks(rtg) - oz->oz_write_pointer); + if (!count_fsb) { + spin_unlock(&oz->oz_alloc_lock); + return 0; + } + rgbno = oz->oz_write_pointer; + oz->oz_write_pointer += count_fsb; + spin_unlock(&oz->oz_alloc_lock); + + trace_xfs_zone_alloc_blocks(oz, rgbno, count_fsb); + + *sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0); + *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *sector); + if (!*is_seq) + *sector += XFS_FSB_TO_BB(mp, rgbno); + return XFS_FSB_TO_B(mp, count_fsb); +} + +void +xfs_mark_rtg_boundary( + struct iomap_ioend *ioend) +{ + struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; + sector_t sector = ioend->io_bio.bi_iter.bi_sector; + + if (xfs_rtb_to_rgbno(mp, xfs_daddr_to_rtb(mp, sector)) == 0) + ioend->io_flags |= IOMAP_IOEND_BOUNDARY; +} + +static void +xfs_submit_zoned_bio( + struct iomap_ioend *ioend, + struct xfs_open_zone *oz, + bool is_seq) +{ + ioend->io_bio.bi_iter.bi_sector = ioend->io_sector; + ioend->io_private = oz; + atomic_inc(&oz->oz_ref); /* for xfs_zoned_end_io */ + + if (is_seq) { + ioend->io_bio.bi_opf &= ~REQ_OP_WRITE; + ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND; + } else { + xfs_mark_rtg_boundary(ioend); + } + + submit_bio(&ioend->io_bio); +} + +void +xfs_zone_alloc_and_submit( + struct iomap_ioend *ioend, + struct xfs_open_zone **oz) +{ + struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_mount *mp = ip->i_mount; + enum rw_hint write_hint = xfs_inode_write_hint(ip); + bool pack_tight = xfs_zoned_pack_tight(ip); + unsigned int alloc_len; + struct iomap_ioend *split; + bool is_seq; + + if (xfs_is_shutdown(mp)) + goto out_error; + + /* + * If we don't have a cached zone in this write context, see if the + * last extent before the one we are writing to points to an active + * zone. If so, just continue writing to it. 
+ */ + if (!*oz && ioend->io_offset) + *oz = xfs_last_used_zone(ioend); + if (!*oz) { +select_zone: + *oz = xfs_select_zone(mp, write_hint, pack_tight); + if (!*oz) + goto out_error; + } + + alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size), + &ioend->io_sector, &is_seq); + if (!alloc_len) { + xfs_open_zone_put(*oz); + goto select_zone; + } + + while ((split = iomap_split_ioend(ioend, alloc_len, is_seq))) { + if (IS_ERR(split)) + goto out_split_error; + alloc_len -= split->io_bio.bi_iter.bi_size; + xfs_submit_zoned_bio(split, *oz, is_seq); + if (!alloc_len) { + xfs_open_zone_put(*oz); + goto select_zone; + } + } + + xfs_submit_zoned_bio(ioend, *oz, is_seq); + return; + +out_split_error: + ioend->io_bio.bi_status = errno_to_blk_status(PTR_ERR(split)); +out_error: + bio_io_error(&ioend->io_bio); +} + +/* + * Wake up all threads waiting for a zoned space allocation when the file system + * is shut down. + */ +void +xfs_zoned_wake_all( + struct xfs_mount *mp) +{ + /* + * Don't wake up if there is no m_zone_info. This is complicated by the + * fact that unmount can't atomically clear m_zone_info and thus we need + * to check SB_ACTIVE for that, but mount temporarily enables SB_ACTIVE + * during log recovery so we can't entirely rely on that either. + */ + if ((mp->m_super->s_flags & SB_ACTIVE) && mp->m_zone_info) + wake_up_all(&mp->m_zone_info->zi_zone_wait); +} + +/* + * Check if @rgbno in @rtg is a potentially valid block. It might still be + * unused, but that information is only found in the rmap. + */ +bool +xfs_zone_rgbno_is_valid( + struct xfs_rtgroup *rtg, + xfs_rgnumber_t rgbno) +{ + lockdep_assert_held(&rtg_rmap(rtg)->i_lock); + + if (rtg->rtg_open_zone) + return rgbno < rtg->rtg_open_zone->oz_write_pointer; + return !xa_get_mark(&rtg_mount(rtg)->m_groups[XG_TYPE_RTG].xa, + rtg_rgno(rtg), XFS_RTG_FREE); +} + +static void +xfs_free_open_zones( + struct xfs_zone_info *zi) +{ + struct xfs_open_zone *oz; + + spin_lock(&zi->zi_open_zones_lock); + while ((oz = list_first_entry_or_null(&zi->zi_open_zones, + struct xfs_open_zone, oz_entry))) { + list_del(&oz->oz_entry); + xfs_open_zone_put(oz); + } + spin_unlock(&zi->zi_open_zones_lock); +} + +struct xfs_init_zones { + struct xfs_mount *mp; + uint64_t available; + uint64_t reclaimable; +}; + +static int +xfs_init_zone( + struct xfs_init_zones *iz, + struct xfs_rtgroup *rtg, + struct blk_zone *zone) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_zone_info *zi = mp->m_zone_info; + uint64_t used = rtg_rmap(rtg)->i_used_blocks; + xfs_rgblock_t write_pointer, highest_rgbno; + int error; + + if (zone && !xfs_zone_validate(zone, rtg, &write_pointer)) + return -EFSCORRUPTED; + + /* + * For sequential write required zones we retrieved the hardware write + * pointer above. + * + * For conventional zones or conventional devices we don't have that + * luxury. Instead query the rmap to find the highest recorded block + * and set the write pointer to the block after that. In case of a + * power loss this misses blocks where the data I/O has completed but + * has not been recorded in the rmap yet, and it also rewrites blocks + * if the most recently written ones got deleted again before unmount, + * but this is the best we can do without hardware support.
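+ *
+ * For example, if the highest block the rmap records for the zone is
+ * rgbno 99, the write pointer resumes at block 100; a zone with no
+ * rmap entries at all restarts at 0.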
+ */ + if (!zone || zone->cond == BLK_ZONE_COND_NOT_WP) { + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + highest_rgbno = xfs_rtrmap_highest_rgbno(rtg); + if (highest_rgbno == NULLRGBLOCK) + write_pointer = 0; + else + write_pointer = highest_rgbno + 1; + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); + } + + /* + * If there are no used blocks, but the zone is not in the empty state + * yet, we lost power before the zone reset. In that case finish the + * work here. + */ + if (write_pointer == rtg_blocks(rtg) && used == 0) { + error = xfs_zone_gc_reset_sync(rtg); + if (error) + return error; + write_pointer = 0; + } + + if (write_pointer == 0) { + /* zone is empty */ + atomic_inc(&zi->zi_nr_free_zones); + xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE); + iz->available += rtg_blocks(rtg); + } else if (write_pointer < rtg_blocks(rtg)) { + /* zone is open */ + struct xfs_open_zone *oz; + + atomic_inc(&rtg_group(rtg)->xg_active_ref); + oz = xfs_init_open_zone(rtg, write_pointer, WRITE_LIFE_NOT_SET, + false); + list_add_tail(&oz->oz_entry, &zi->zi_open_zones); + zi->zi_nr_open_zones++; + + iz->available += (rtg_blocks(rtg) - write_pointer); + iz->reclaimable += write_pointer - used; + } else if (used < rtg_blocks(rtg)) { + /* zone fully written, but has freed blocks */ + xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used); + iz->reclaimable += (rtg_blocks(rtg) - used); + } + + return 0; +} + +static int +xfs_get_zone_info_cb( + struct blk_zone *zone, + unsigned int idx, + void *data) +{ + struct xfs_init_zones *iz = data; + struct xfs_mount *mp = iz->mp; + xfs_fsblock_t zsbno = xfs_daddr_to_rtb(mp, zone->start); + xfs_rgnumber_t rgno; + struct xfs_rtgroup *rtg; + int error; + + if (xfs_rtb_to_rgbno(mp, zsbno) != 0) { + xfs_warn(mp, "mismatched zone start 0x%llx.", zsbno); + return -EFSCORRUPTED; + } + + rgno = xfs_rtb_to_rgno(mp, zsbno); + rtg = xfs_rtgroup_grab(mp, rgno); + if (!rtg) { + xfs_warn(mp, "realtime group not found for zone %u.", rgno); + return -EFSCORRUPTED; + } + error = xfs_init_zone(iz, rtg, zone); + xfs_rtgroup_rele(rtg); + return error; +} + +/* + * Calculate the max open zone limit based on the number of backing zones + * available. + */ +static inline uint32_t +xfs_max_open_zones( + struct xfs_mount *mp) +{ + unsigned int max_open, max_open_data_zones; + /* + * We need two zones for every open data zone: one in reserve, as we + * don't reclaim open zones. One data zone and its spare are included + * in XFS_MIN_ZONES. + */ + max_open_data_zones = (mp->m_sb.sb_rgcount - XFS_MIN_ZONES) / 2 + 1; + max_open = max_open_data_zones + XFS_OPEN_GC_ZONES; + + /* + * Cap the max open limit to 1/4 of available space. + */ + max_open = min(max_open, mp->m_sb.sb_rgcount / 4); + + return max(XFS_MIN_OPEN_ZONES, max_open); +} + +/* + * Normally we use the open zone limit that the device reports. If there is + * none, let the user pick one from the command line. + * + * If the device doesn't report an open zone limit and there is no override, + * allow holding about a quarter of the zones open. In theory we could allow + * all to be open, but at that point we run into GC deadlocks because we can't + * reclaim open zones. + * + * When used on conventional SSDs, a lower open limit is advisable as we'll + * otherwise overwhelm the FTL just as much as a conventional block allocator. + * + * Note: To debug the open zone management code, force max_open to 1 here.
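+ *
+ * Worked example with assumed values XFS_MIN_ZONES == 5 and
+ * XFS_OPEN_GC_ZONES == 1 (both constants live in xfs_zones.h, outside
+ * this hunk): for sb_rgcount == 100, xfs_max_open_zones() computes
+ *
+ *	max_open_data_zones = (100 - 5) / 2 + 1 = 48
+ *	max_open = min(48 + 1, 100 / 4) = 25
+ *
+ * before the final clamp to at least XFS_MIN_OPEN_ZONES.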
+ */ +static int +xfs_calc_open_zones( + struct xfs_mount *mp) +{ + struct block_device *bdev = mp->m_rtdev_targp->bt_bdev; + unsigned int bdev_open_zones = bdev_max_open_zones(bdev); + + if (!mp->m_max_open_zones) { + if (bdev_open_zones) + mp->m_max_open_zones = bdev_open_zones; + else + mp->m_max_open_zones = xfs_max_open_zones(mp); + } + + if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) { + xfs_notice(mp, "need at least %u open zones.", + XFS_MIN_OPEN_ZONES); + return -EIO; + } + + if (bdev_open_zones && bdev_open_zones < mp->m_max_open_zones) { + mp->m_max_open_zones = bdev_open_zones; + xfs_info(mp, "limiting open zones to %u due to hardware limit.", + bdev_open_zones); + } + + if (mp->m_max_open_zones > xfs_max_open_zones(mp)) { + mp->m_max_open_zones = xfs_max_open_zones(mp); + xfs_info(mp, +"limiting open zones to %u due to total zone count (%u)", + mp->m_max_open_zones, mp->m_sb.sb_rgcount); + } + + return 0; +} + +static unsigned long * +xfs_alloc_bucket_bitmap( + struct xfs_mount *mp) +{ + return kvmalloc_array(BITS_TO_LONGS(mp->m_sb.sb_rgcount), + sizeof(unsigned long), GFP_KERNEL | __GFP_ZERO); +} + +static struct xfs_zone_info * +xfs_alloc_zone_info( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi; + int i; + + zi = kzalloc(sizeof(*zi), GFP_KERNEL); + if (!zi) + return NULL; + INIT_LIST_HEAD(&zi->zi_open_zones); + INIT_LIST_HEAD(&zi->zi_reclaim_reservations); + spin_lock_init(&zi->zi_reset_list_lock); + spin_lock_init(&zi->zi_open_zones_lock); + spin_lock_init(&zi->zi_reservation_lock); + init_waitqueue_head(&zi->zi_zone_wait); + spin_lock_init(&zi->zi_used_buckets_lock); + for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) { + zi->zi_used_bucket_bitmap[i] = xfs_alloc_bucket_bitmap(mp); + if (!zi->zi_used_bucket_bitmap[i]) + goto out_free_bitmaps; + } + return zi; + +out_free_bitmaps: + while (--i >= 0) + kvfree(zi->zi_used_bucket_bitmap[i]); + kfree(zi); + return NULL; +} + +static void +xfs_free_zone_info( + struct xfs_zone_info *zi) +{ + int i; + + xfs_free_open_zones(zi); + for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) + kvfree(zi->zi_used_bucket_bitmap[i]); + kfree(zi); +} + +int +xfs_mount_zones( + struct xfs_mount *mp) +{ + struct xfs_init_zones iz = { + .mp = mp, + }; + struct xfs_buftarg *bt = mp->m_rtdev_targp; + int error; + + if (!bt) { + xfs_notice(mp, "RT device missing."); + return -EINVAL; + } + + if (!xfs_has_rtgroups(mp) || !xfs_has_rmapbt(mp)) { + xfs_notice(mp, "invalid flag combination."); + return -EFSCORRUPTED; + } + if (mp->m_sb.sb_rextsize != 1) { + xfs_notice(mp, "zoned file systems do not support rextsize."); + return -EFSCORRUPTED; + } + if (mp->m_sb.sb_rgcount < XFS_MIN_ZONES) { + xfs_notice(mp, +"zoned file systems need to have at least %u zones.", XFS_MIN_ZONES); + return -EFSCORRUPTED; + } + + error = xfs_calc_open_zones(mp); + if (error) + return error; + + mp->m_zone_info = xfs_alloc_zone_info(mp); + if (!mp->m_zone_info) + return -ENOMEM; + + xfs_info(mp, "%u zones of %u blocks each (%u max open)", + mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks, + mp->m_max_open_zones); + trace_xfs_zones_mount(mp); + + if (bdev_is_zoned(bt->bt_bdev)) { + error = blkdev_report_zones(bt->bt_bdev, + XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart), + mp->m_sb.sb_rgcount, xfs_get_zone_info_cb, &iz); + if (error < 0) + goto out_free_zone_info; + } else { + struct xfs_rtgroup *rtg = NULL; + + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + error = xfs_init_zone(&iz, rtg, NULL); + if (error) + goto out_free_zone_info; + } + } + + xfs_set_freecounter(mp,
XC_FREE_RTAVAILABLE, iz.available); + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, + iz.available + iz.reclaimable); + + error = xfs_zone_gc_mount(mp); + if (error) + goto out_free_zone_info; + return 0; + +out_free_zone_info: + xfs_free_zone_info(mp->m_zone_info); + return error; +} + +void +xfs_unmount_zones( + struct xfs_mount *mp) +{ + xfs_zone_gc_unmount(mp); + xfs_free_zone_info(mp->m_zone_info); +} diff --git a/fs/xfs/xfs_zone_alloc.h b/fs/xfs/xfs_zone_alloc.h new file mode 100644 index 000000000000..ecf39106704c --- /dev/null +++ b/fs/xfs/xfs_zone_alloc.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _XFS_ZONE_ALLOC_H +#define _XFS_ZONE_ALLOC_H + +struct iomap_ioend; +struct xfs_open_zone; + +struct xfs_zone_alloc_ctx { + struct xfs_open_zone *open_zone; + xfs_filblks_t reserved_blocks; +}; + +/* + * Grab any available space, even if it is less than what the caller asked for. + */ +#define XFS_ZR_GREEDY (1U << 0) +/* + * Only grab instantly available space, don't wait or GC. + */ +#define XFS_ZR_NOWAIT (1U << 1) +/* + * Dip into the reserved pool. + */ +#define XFS_ZR_RESERVED (1U << 2) + +int xfs_zoned_space_reserve(struct xfs_inode *ip, xfs_filblks_t count_fsb, + unsigned int flags, struct xfs_zone_alloc_ctx *ac); +void xfs_zoned_space_unreserve(struct xfs_inode *ip, + struct xfs_zone_alloc_ctx *ac); +void xfs_zoned_add_available(struct xfs_mount *mp, xfs_filblks_t count_fsb); + +void xfs_zone_alloc_and_submit(struct iomap_ioend *ioend, + struct xfs_open_zone **oz); +int xfs_zone_free_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg, + xfs_fsblock_t fsbno, xfs_filblks_t len); +int xfs_zoned_end_io(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count, + xfs_daddr_t daddr, struct xfs_open_zone *oz, + xfs_fsblock_t old_startblock); +void xfs_open_zone_put(struct xfs_open_zone *oz); + +void xfs_zoned_wake_all(struct xfs_mount *mp); +bool xfs_zone_rgbno_is_valid(struct xfs_rtgroup *rtg, xfs_rgnumber_t rgbno); +void xfs_mark_rtg_boundary(struct iomap_ioend *ioend); + +uint64_t xfs_zoned_default_resblks(struct xfs_mount *mp, + enum xfs_free_counter ctr); +void xfs_zoned_show_stats(struct seq_file *m, struct xfs_mount *mp); + +#ifdef CONFIG_XFS_RT +int xfs_mount_zones(struct xfs_mount *mp); +void xfs_unmount_zones(struct xfs_mount *mp); +void xfs_zone_gc_start(struct xfs_mount *mp); +void xfs_zone_gc_stop(struct xfs_mount *mp); +#else +static inline int xfs_mount_zones(struct xfs_mount *mp) +{ + return -EIO; +} +static inline void xfs_unmount_zones(struct xfs_mount *mp) +{ +} +static inline void xfs_zone_gc_start(struct xfs_mount *mp) +{ +} +static inline void xfs_zone_gc_stop(struct xfs_mount *mp) +{ +} +#endif /* CONFIG_XFS_RT */ + +#endif /* _XFS_ZONE_ALLOC_H */ diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c new file mode 100644 index 000000000000..c5136ea9bb1d --- /dev/null +++ b/fs/xfs/xfs_zone_gc.c @@ -0,0 +1,1165 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. 
+ */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_trans.h" +#include "xfs_icache.h" +#include "xfs_rmap.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" +#include "xfs_zones.h" +#include "xfs_trace.h" + +/* + * Implement Garbage Collection (GC) of partially used zones. + * + * To support the purely sequential writes in each zone, zoned XFS needs to be + * able to move data remaining in a zone out of it to reset the zone to prepare + * for writing to it again. + * + * This is done by the GC thread implemented in this file. To support that, a + * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to + * write the garbage collected data into. + * + * Whenever the available space is below the chosen threshold, the GC thread + * looks for potential non-empty but not fully used zones that are worth + * reclaiming. Once found, the rmap for the victim zone is queried, and after + * a bit of sorting to reduce fragmentation, the still live extents are read + * into memory and written to the GC target zone, and the bmap btree of the + * files is updated to point to the new location. To avoid taking the IOLOCK + * and MMAPLOCK for the entire GC process and thus affecting the latency of + * user reads and writes to the files, the GC writes are speculative and the + * I/O completion checks that no other writes happened for the affected regions + * before remapping. + * + * Once a zone does not contain any valid data, be that through GC or user + * block removal, it is queued for a zone reset. The reset operation + * carefully ensures that the RT device cache is flushed and all transactions + * referencing the rmap have been committed to disk. + */ + +/* + * Size of each GC scratch pad. This is also the upper bound for each + * GC I/O, which helps to keep latency down. + */ +#define XFS_GC_CHUNK_SIZE SZ_1M + +/* + * Scratchpad data to read GCed data into. + * + * The offset member tracks where the next allocation starts, and freed tracks + * the amount of space that is not used anymore. + */ +#define XFS_ZONE_GC_NR_SCRATCH 2 +struct xfs_zone_scratch { + struct folio *folio; + unsigned int offset; + unsigned int freed; +}; + +/* + * Chunk that is read and written for each GC operation. + * + * Note that for writes to actual zoned devices, the chunk can be split when + * reaching the hardware limit. + */ +struct xfs_gc_bio { + struct xfs_zone_gc_data *data; + + /* + * Entry into the reading/writing/resetting list. Only accessed from + * the GC thread, so no locking needed. + */ + struct list_head entry; + + /* + * State of this gc_bio. Done means the current I/O completed. + * Set from the bio end I/O handler, read from the GC thread. + */ + enum { + XFS_GC_BIO_NEW, + XFS_GC_BIO_DONE, + } state; + + /* + * Pointer to the inode and byte range in the inode that this + * GC chunk is operating on. + */ + struct xfs_inode *ip; + loff_t offset; + unsigned int len; + + /* + * Existing startblock (in the zone to be freed) and newly assigned + * daddr in the zone GCed into. + */ + xfs_fsblock_t old_startblock; + xfs_daddr_t new_daddr; + struct xfs_zone_scratch *scratch; + + /* Are we writing to a sequential write required zone?
*/ + bool is_seq; + + /* Open Zone being written to */ + struct xfs_open_zone *oz; + + /* Bio used for reads and writes, including the bvec used by it */ + struct bio_vec bv; + struct bio bio; /* must be last */ +}; + +#define XFS_ZONE_GC_RECS 1024 + +/* iterator, needs to be reinitialized for each victim zone */ +struct xfs_zone_gc_iter { + struct xfs_rtgroup *victim_rtg; + unsigned int rec_count; + unsigned int rec_idx; + xfs_agblock_t next_startblock; + struct xfs_rmap_irec *recs; +}; + +/* + * Per-mount GC state. + */ +struct xfs_zone_gc_data { + struct xfs_mount *mp; + + /* bioset used to allocate the gc_bios */ + struct bio_set bio_set; + + /* + * Scratchpads used, and index to indicate which one is in use. + */ + struct xfs_zone_scratch scratch[XFS_ZONE_GC_NR_SCRATCH]; + unsigned int scratch_idx; + + /* + * List of bios currently being read, written and reset. + * These lists are only accessed by the GC thread itself, and must only + * be processed in order. + */ + struct list_head reading; + struct list_head writing; + struct list_head resetting; + + /* + * Iterator for the victim zone. + */ + struct xfs_zone_gc_iter iter; +}; + +/* + * We aim to keep enough zones free in stock to fully use the open zone limit + * for data placement purposes. + */ +bool +xfs_zoned_need_gc( + struct xfs_mount *mp) +{ + if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE)) + return false; + if (xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE) < + mp->m_groups[XG_TYPE_RTG].blocks * + (mp->m_max_open_zones - XFS_OPEN_GC_ZONES)) + return true; + return false; +} + +static struct xfs_zone_gc_data * +xfs_zone_gc_data_alloc( + struct xfs_mount *mp) +{ + struct xfs_zone_gc_data *data; + int i; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return NULL; + data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs), + GFP_KERNEL); + if (!data->iter.recs) + goto out_free_data; + + /* + * We actually only need a single bio_vec. It would be nice to have + * a flag that only allocates the inline bvecs and not the separate + * bvec pool. + */ + if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio), + BIOSET_NEED_BVECS)) + goto out_free_recs; + for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) { + data->scratch[i].folio = + folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE)); + if (!data->scratch[i].folio) + goto out_free_scratch; + } + INIT_LIST_HEAD(&data->reading); + INIT_LIST_HEAD(&data->writing); + INIT_LIST_HEAD(&data->resetting); + data->mp = mp; + return data; + +out_free_scratch: + while (--i >= 0) + folio_put(data->scratch[i].folio); + bioset_exit(&data->bio_set); +out_free_recs: + kfree(data->iter.recs); +out_free_data: + kfree(data); + return NULL; +} + +static void +xfs_zone_gc_data_free( + struct xfs_zone_gc_data *data) +{ + int i; + + for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) + folio_put(data->scratch[i].folio); + bioset_exit(&data->bio_set); + kfree(data->iter.recs); + kfree(data); +} + +static void +xfs_zone_gc_iter_init( + struct xfs_zone_gc_iter *iter, + struct xfs_rtgroup *victim_rtg) +{ + iter->next_startblock = 0; + iter->rec_count = 0; + iter->rec_idx = 0; + iter->victim_rtg = victim_rtg; +} + +/* + * Query the rmap of the victim zone to gather the records to evacuate.
+ */ +static int +xfs_zone_gc_query_cb( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *irec, + void *private) +{ + struct xfs_zone_gc_iter *iter = private; + + ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner)); + ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner)); + ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))); + + iter->recs[iter->rec_count] = *irec; + if (++iter->rec_count == XFS_ZONE_GC_RECS) { + iter->next_startblock = + irec->rm_startblock + irec->rm_blockcount; + return 1; + } + return 0; +} + +#define cmp_int(l, r) ((l > r) - (l < r)) + +static int +xfs_zone_gc_rmap_rec_cmp( + const void *a, + const void *b) +{ + const struct xfs_rmap_irec *reca = a; + const struct xfs_rmap_irec *recb = b; + int diff; + + diff = cmp_int(reca->rm_owner, recb->rm_owner); + if (diff) + return diff; + return cmp_int(reca->rm_offset, recb->rm_offset); +} + +static int +xfs_zone_gc_query( + struct xfs_mount *mp, + struct xfs_zone_gc_iter *iter) +{ + struct xfs_rtgroup *rtg = iter->victim_rtg; + struct xfs_rmap_irec ri_low = { }; + struct xfs_rmap_irec ri_high; + struct xfs_btree_cur *cur; + struct xfs_trans *tp; + int error; + + ASSERT(iter->next_startblock <= rtg_blocks(rtg)); + if (iter->next_startblock == rtg_blocks(rtg)) + goto done; + + ASSERT(iter->next_startblock < rtg_blocks(rtg)); + ri_low.rm_startblock = iter->next_startblock; + memset(&ri_high, 0xFF, sizeof(ri_high)); + + iter->rec_idx = 0; + iter->rec_count = 0; + + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + return error; + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + cur = xfs_rtrmapbt_init_cursor(tp, rtg); + error = xfs_rmap_query_range(cur, &ri_low, &ri_high, + xfs_zone_gc_query_cb, iter); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); + xfs_btree_del_cursor(cur, error < 0 ? error : 0); + xfs_trans_cancel(tp); + + if (error < 0) + return error; + + /* + * Sort the rmap records by inode number and increasing offset to + * defragment the mappings. + * + * This could be further enhanced by an even bigger look ahead window, + * but that's better left until we have better detection of changes to + * inode mapping to avoid the potential of GCing already dead data. + */ + sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]), + xfs_zone_gc_rmap_rec_cmp, NULL); + + if (error == 0) { + /* + * We finished iterating through the zone. + */ + iter->next_startblock = rtg_blocks(rtg); + if (iter->rec_count == 0) + goto done; + } + + return 0; +done: + xfs_rtgroup_rele(iter->victim_rtg); + iter->victim_rtg = NULL; + return 0; +} + +static bool +xfs_zone_gc_iter_next( + struct xfs_mount *mp, + struct xfs_zone_gc_iter *iter, + struct xfs_rmap_irec *chunk_rec, + struct xfs_inode **ipp) +{ + struct xfs_rmap_irec *irec; + int error; + + if (!iter->victim_rtg) + return false; + +retry: + if (iter->rec_idx == iter->rec_count) { + error = xfs_zone_gc_query(mp, iter); + if (error) + goto fail; + if (!iter->victim_rtg) + return false; + } + + irec = &iter->recs[iter->rec_idx]; + error = xfs_iget(mp, NULL, irec->rm_owner, + XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp); + if (error) { + /* + * If the inode was already deleted, skip over it. 
+ */ + if (error == -ENOENT) { + iter->rec_idx++; + goto retry; + } + goto fail; + } + + if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) { + iter->rec_idx++; + xfs_irele(*ipp); + goto retry; + } + + *chunk_rec = *irec; + return true; + +fail: + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + return false; +} + +static void +xfs_zone_gc_iter_advance( + struct xfs_zone_gc_iter *iter, + xfs_extlen_t count_fsb) +{ + struct xfs_rmap_irec *irec = &iter->recs[iter->rec_idx]; + + irec->rm_offset += count_fsb; + irec->rm_startblock += count_fsb; + irec->rm_blockcount -= count_fsb; + if (!irec->rm_blockcount) + iter->rec_idx++; +} + +static struct xfs_rtgroup * +xfs_zone_gc_pick_victim_from( + struct xfs_mount *mp, + uint32_t bucket) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + uint32_t victim_used = U32_MAX; + struct xfs_rtgroup *victim_rtg = NULL; + uint32_t bit; + + if (!zi->zi_used_bucket_entries[bucket]) + return NULL; + + for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket], + mp->m_sb.sb_rgcount) { + struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit); + + if (!rtg) + continue; + + /* skip zones that are just waiting for a reset */ + if (rtg_rmap(rtg)->i_used_blocks == 0 || + rtg_rmap(rtg)->i_used_blocks >= victim_used) { + xfs_rtgroup_rele(rtg); + continue; + } + + if (victim_rtg) + xfs_rtgroup_rele(victim_rtg); + victim_rtg = rtg; + victim_used = rtg_rmap(rtg)->i_used_blocks; + + /* + * Any zone that is less than 1 percent used is fair game for + * instant reclaim. All of these zones are in the last + * bucket, so avoid the expensive division for the zones + * in the other buckets. + */ + if (bucket == 0 && + rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100) + break; + } + + return victim_rtg; +} + +/* + * Iterate through all zones marked as reclaimable and find a candidate to + * reclaim. + */ +static bool +xfs_zone_gc_select_victim( + struct xfs_zone_gc_data *data) +{ + struct xfs_zone_gc_iter *iter = &data->iter; + struct xfs_mount *mp = data->mp; + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_rtgroup *victim_rtg = NULL; + unsigned int bucket; + + if (xfs_is_shutdown(mp)) + return false; + + if (iter->victim_rtg) + return true; + + /* + * Don't start new work if we are asked to stop or park. + */ + if (kthread_should_stop() || kthread_should_park()) + return false; + + if (!xfs_zoned_need_gc(mp)) + return false; + + spin_lock(&zi->zi_used_buckets_lock); + for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) { + victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket); + if (victim_rtg) + break; + } + spin_unlock(&zi->zi_used_buckets_lock); + + if (!victim_rtg) + return false; + + trace_xfs_zone_gc_select_victim(victim_rtg, bucket); + xfs_zone_gc_iter_init(iter, victim_rtg); + return true; +} + +static struct xfs_open_zone * +xfs_zone_gc_steal_open( + struct xfs_zone_info *zi) +{ + struct xfs_open_zone *oz, *found = NULL; + + spin_lock(&zi->zi_open_zones_lock); + list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) { + if (!found || + oz->oz_write_pointer < found->oz_write_pointer) + found = oz; + } + + if (found) { + found->oz_is_gc = true; + list_del_init(&found->oz_entry); + zi->zi_nr_open_zones--; + } + + spin_unlock(&zi->zi_open_zones_lock); + return found; +} + +static struct xfs_open_zone * +xfs_zone_gc_select_target( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_open_zone *oz = zi->zi_open_gc_zone; + + /* + * We need to wait for pending writes to finish. 
+ */ + if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg)) + return NULL; + + ASSERT(zi->zi_nr_open_zones <= + mp->m_max_open_zones - XFS_OPEN_GC_ZONES); + oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true); + if (oz) + trace_xfs_zone_gc_target_opened(oz->oz_rtg); + spin_lock(&zi->zi_open_zones_lock); + zi->zi_open_gc_zone = oz; + spin_unlock(&zi->zi_open_zones_lock); + return oz; +} + +/* + * Ensure we have a valid open zone to write the GC data to. + * + * If the current target zone has space keep writing to it, else first wait for + * all pending writes and then pick a new one. + */ +static struct xfs_open_zone * +xfs_zone_gc_ensure_target( + struct xfs_mount *mp) +{ + struct xfs_open_zone *oz = mp->m_zone_info->zi_open_gc_zone; + + if (!oz || oz->oz_write_pointer == rtg_blocks(oz->oz_rtg)) + return xfs_zone_gc_select_target(mp); + return oz; +} + +static unsigned int +xfs_zone_gc_scratch_available( + struct xfs_zone_gc_data *data) +{ + return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset; +} + +static bool +xfs_zone_gc_space_available( + struct xfs_zone_gc_data *data) +{ + struct xfs_open_zone *oz; + + oz = xfs_zone_gc_ensure_target(data->mp); + if (!oz) + return false; + return oz->oz_write_pointer < rtg_blocks(oz->oz_rtg) && + xfs_zone_gc_scratch_available(data); +} + +static void +xfs_zone_gc_end_io( + struct bio *bio) +{ + struct xfs_gc_bio *chunk = + container_of(bio, struct xfs_gc_bio, bio); + struct xfs_zone_gc_data *data = chunk->data; + + WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE); + wake_up_process(data->mp->m_zone_info->zi_gc_thread); +} + +static struct xfs_open_zone * +xfs_zone_gc_alloc_blocks( + struct xfs_zone_gc_data *data, + xfs_extlen_t *count_fsb, + xfs_daddr_t *daddr, + bool *is_seq) +{ + struct xfs_mount *mp = data->mp; + struct xfs_open_zone *oz; + + oz = xfs_zone_gc_ensure_target(mp); + if (!oz) + return NULL; + + *count_fsb = min(*count_fsb, + XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data))); + + /* + * Directly allocate GC blocks from the reserved pool. + * + * If we'd take them from the normal pool we could be stealing blocks + * from a regular writer, which would then have to wait for GC and + * deadlock. 
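+ *
+ * Spelled out: a writer that finds no usable zone sleeps on
+ * zi_zone_wait until GC frees one; if GC's target blocks came from the
+ * normal pool, that sleeping writer might already have claimed them,
+ * and neither side could make progress.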
+ */ + spin_lock(&mp->m_sb_lock); + *count_fsb = min(*count_fsb, + rtg_blocks(oz->oz_rtg) - oz->oz_write_pointer); + *count_fsb = min3(*count_fsb, + mp->m_free[XC_FREE_RTEXTENTS].res_avail, + mp->m_free[XC_FREE_RTAVAILABLE].res_avail); + mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb; + mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb; + spin_unlock(&mp->m_sb_lock); + + if (!*count_fsb) + return NULL; + + *daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0); + *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr); + if (!*is_seq) + *daddr += XFS_FSB_TO_BB(mp, oz->oz_write_pointer); + oz->oz_write_pointer += *count_fsb; + atomic_inc(&oz->oz_ref); + return oz; +} + +static bool +xfs_zone_gc_start_chunk( + struct xfs_zone_gc_data *data) +{ + struct xfs_zone_gc_iter *iter = &data->iter; + struct xfs_mount *mp = data->mp; + struct block_device *bdev = mp->m_rtdev_targp->bt_bdev; + struct xfs_open_zone *oz; + struct xfs_rmap_irec irec; + struct xfs_gc_bio *chunk; + struct xfs_inode *ip; + struct bio *bio; + xfs_daddr_t daddr; + bool is_seq; + + if (xfs_is_shutdown(mp)) + return false; + + if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip)) + return false; + oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr, + &is_seq); + if (!oz) { + xfs_irele(ip); + return false; + } + + bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set); + + chunk = container_of(bio, struct xfs_gc_bio, bio); + chunk->ip = ip; + chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset); + chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount); + chunk->old_startblock = + xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock); + chunk->new_daddr = daddr; + chunk->is_seq = is_seq; + chunk->scratch = &data->scratch[data->scratch_idx]; + chunk->data = data; + chunk->oz = oz; + + bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock); + bio->bi_end_io = xfs_zone_gc_end_io; + bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len, + chunk->scratch->offset); + chunk->scratch->offset += chunk->len; + if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) { + data->scratch_idx = + (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH; + } + WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); + list_add_tail(&chunk->entry, &data->reading); + xfs_zone_gc_iter_advance(iter, irec.rm_blockcount); + + submit_bio(bio); + return true; +} + +static void +xfs_zone_gc_free_chunk( + struct xfs_gc_bio *chunk) +{ + list_del(&chunk->entry); + xfs_open_zone_put(chunk->oz); + xfs_irele(chunk->ip); + bio_put(&chunk->bio); +} + +static void +xfs_zone_gc_submit_write( + struct xfs_zone_gc_data *data, + struct xfs_gc_bio *chunk) +{ + if (chunk->is_seq) { + chunk->bio.bi_opf &= ~REQ_OP_WRITE; + chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND; + } + chunk->bio.bi_iter.bi_sector = chunk->new_daddr; + chunk->bio.bi_end_io = xfs_zone_gc_end_io; + submit_bio(&chunk->bio); +} + +static struct xfs_gc_bio * +xfs_zone_gc_split_write( + struct xfs_zone_gc_data *data, + struct xfs_gc_bio *chunk) +{ + struct queue_limits *lim = + &bdev_get_queue(chunk->bio.bi_bdev)->limits; + struct xfs_gc_bio *split_chunk; + int split_sectors; + unsigned int split_len; + struct bio *split; + unsigned int nsegs; + + if (!chunk->is_seq) + return NULL; + + split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs, + lim->max_zone_append_sectors << SECTOR_SHIFT); + if (!split_sectors) + return NULL; + + /* ensure the split chunk is still block size aligned */ + split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT, + data->mp->m_sb.sb_blocksize) >> 
+
+static void
+xfs_zone_gc_write_chunk(
+ struct xfs_gc_bio *chunk)
+{
+ struct xfs_zone_gc_data *data = chunk->data;
+ struct xfs_mount *mp = chunk->ip->i_mount;
+ unsigned int folio_offset = chunk->bio.bi_io_vec->bv_offset;
+ struct xfs_gc_bio *split_chunk;
+
+ if (chunk->bio.bi_status)
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ if (xfs_is_shutdown(mp)) {
+ xfs_zone_gc_free_chunk(chunk);
+ return;
+ }
+
+ WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
+ list_move_tail(&chunk->entry, &data->writing);
+
+ bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
+ bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
+ folio_offset);
+
+ while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
+ xfs_zone_gc_submit_write(data, split_chunk);
+ xfs_zone_gc_submit_write(data, chunk);
+}
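These helpers all rely on struct xfs_gc_bio embedding its bio, so the completion handler can recover the owning chunk without any lookup table. A tiny self-contained illustration of that embedding idiom (simplified types, not the kernel definitions):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct fake_bio { int status; };

struct gc_chunk {
	int state;
	struct fake_bio bio;	/* must be embedded, not pointed to */
};

static void end_io(struct fake_bio *bio)
{
	struct gc_chunk *chunk = container_of(bio, struct gc_chunk, bio);

	chunk->state = 1;	/* XFS_GC_BIO_DONE in the real code */
}

int main(void)
{
	struct gc_chunk c = { .state = 0 };

	end_io(&c.bio);
	printf("state %d\n", c.state);
	return 0;
}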
+
+static void
+xfs_zone_gc_finish_chunk(
+ struct xfs_gc_bio *chunk)
+{
+ uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ struct xfs_inode *ip = chunk->ip;
+ struct xfs_mount *mp = ip->i_mount;
+ int error;
+
+ if (chunk->bio.bi_status)
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ if (xfs_is_shutdown(mp)) {
+ xfs_zone_gc_free_chunk(chunk);
+ return;
+ }
+
+ chunk->scratch->freed += chunk->len;
+ if (chunk->scratch->freed == chunk->scratch->offset) {
+ chunk->scratch->offset = 0;
+ chunk->scratch->freed = 0;
+ }
+
+ /*
+ * Cycle through the iolock and wait for direct I/O and layouts to
+ * ensure no one is reading from the old mapping before it goes away.
+ *
+ * Note that xfs_zoned_end_io() below checks that no other writer raced
+ * with us to update the mapping by checking that the old startblock
+ * didn't change.
+ */
+ xfs_ilock(ip, iolock);
+ error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
+ if (!error)
+ inode_dio_wait(VFS_I(ip));
+ xfs_iunlock(ip, iolock);
+ if (error)
+ goto free;
+
+ if (chunk->is_seq)
+ chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
+ error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
+ chunk->new_daddr, chunk->oz, chunk->old_startblock);
+free:
+ if (error)
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ xfs_zone_gc_free_chunk(chunk);
+}
+
+static void
+xfs_zone_gc_finish_reset(
+ struct xfs_gc_bio *chunk)
+{
+ struct xfs_rtgroup *rtg = chunk->bio.bi_private;
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_zone_info *zi = mp->m_zone_info;
+
+ if (chunk->bio.bi_status) {
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ goto out;
+ }
+
+ xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
+ atomic_inc(&zi->zi_nr_free_zones);
+
+ xfs_zoned_add_available(mp, rtg_blocks(rtg));
+
+ wake_up_all(&zi->zi_zone_wait);
+out:
+ list_del(&chunk->entry);
+ bio_put(&chunk->bio);
+}
+
+static bool
+xfs_zone_gc_prepare_reset(
+ struct bio *bio,
+ struct xfs_rtgroup *rtg)
+{
+ trace_xfs_zone_reset(rtg);
+
+ ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
+ bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
+ if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
+ if (!bdev_max_discard_sectors(bio->bi_bdev))
+ return false;
+ bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
+ bio->bi_iter.bi_size =
+ XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg));
+ }
+
+ return true;
+}
+
+int
+xfs_zone_gc_reset_sync(
+ struct xfs_rtgroup *rtg)
+{
+ int error = 0;
+ struct bio bio;
+
+ bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
+ REQ_OP_ZONE_RESET);
+ if (xfs_zone_gc_prepare_reset(&bio, rtg))
+ error = submit_bio_wait(&bio);
+ bio_uninit(&bio);
+
+ return error;
+}
+
+static void
+xfs_zone_gc_reset_zones(
+ struct xfs_zone_gc_data *data,
+ struct xfs_group *reset_list)
+{
+ struct xfs_group *next = reset_list;
+
+ if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
+ xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
+ return;
+ }
+
+ do {
+ struct xfs_rtgroup *rtg = to_rtg(next);
+ struct xfs_gc_bio *chunk;
+ struct bio *bio;
+
+ xfs_log_force_inode(rtg_rmap(rtg));
+
+ next = rtg_group(rtg)->xg_next_reset;
+ rtg_group(rtg)->xg_next_reset = NULL;
+
+ bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
+ 0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
+ bio->bi_private = rtg;
+ bio->bi_end_io = xfs_zone_gc_end_io;
+
+ chunk = container_of(bio, struct xfs_gc_bio, bio);
+ chunk->data = data;
+ WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
+ list_add_tail(&chunk->entry, &data->resetting);
+
+ /*
+ * Also use the bio to drive the state machine when neither
+ * zone reset nor discard is supported to keep things simple.
+ */
+ if (xfs_zone_gc_prepare_reset(bio, rtg))
+ submit_bio(bio);
+ else
+ bio_endio(bio);
+ } while (next);
+}
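Stripped of the bio setup, xfs_zone_gc_prepare_reset() collapses to a three-way decision; a compact model of just that decision (hypothetical enum, with return values standing in for the actual bio configuration):

#include <stdio.h>

enum reset_op { OP_ZONE_RESET, OP_DISCARD, OP_NONE };

/*
 * Sequential zones take a real zone reset; conventional zones fall back
 * to discard when the device supports it; otherwise the bio is completed
 * without doing I/O so the state machine still advances.
 */
static enum reset_op pick_reset_op(int zone_is_seq, int max_discard_sectors)
{
	if (zone_is_seq)
		return OP_ZONE_RESET;
	if (max_discard_sectors)
		return OP_DISCARD;
	return OP_NONE;
}

int main(void)
{
	printf("%d %d %d\n",
	       pick_reset_op(1, 0),		/* zone reset */
	       pick_reset_op(0, 1024),		/* discard */
	       pick_reset_op(0, 0));		/* no-op completion */
	return 0;
}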
+
+/*
+ * Handle the work to read and write data for GC and to reset the zones,
+ * including handling all completions.
+ *
+ * Note that the order of the chunks is preserved so that we don't undo the
+ * optimal order established by xfs_zone_gc_query().
+ */
+static bool
+xfs_zone_gc_handle_work(
+ struct xfs_zone_gc_data *data)
+{
+ struct xfs_zone_info *zi = data->mp->m_zone_info;
+ struct xfs_gc_bio *chunk, *next;
+ struct xfs_group *reset_list;
+ struct blk_plug plug;
+
+ spin_lock(&zi->zi_reset_list_lock);
+ reset_list = zi->zi_reset_list;
+ zi->zi_reset_list = NULL;
+ spin_unlock(&zi->zi_reset_list_lock);
+
+ if (!xfs_zone_gc_select_victim(data) ||
+ !xfs_zone_gc_space_available(data)) {
+ if (list_empty(&data->reading) &&
+ list_empty(&data->writing) &&
+ list_empty(&data->resetting) &&
+ !reset_list)
+ return false;
+ }
+
+ __set_current_state(TASK_RUNNING);
+ try_to_freeze();
+
+ if (reset_list)
+ xfs_zone_gc_reset_zones(data, reset_list);
+
+ list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
+ if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
+ break;
+ xfs_zone_gc_finish_reset(chunk);
+ }
+
+ list_for_each_entry_safe(chunk, next, &data->writing, entry) {
+ if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
+ break;
+ xfs_zone_gc_finish_chunk(chunk);
+ }
+
+ blk_start_plug(&plug);
+ list_for_each_entry_safe(chunk, next, &data->reading, entry) {
+ if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
+ break;
+ xfs_zone_gc_write_chunk(chunk);
+ }
+ blk_finish_plug(&plug);
+
+ blk_start_plug(&plug);
+ while (xfs_zone_gc_start_chunk(data))
+ ;
+ blk_finish_plug(&plug);
+ return true;
+}
+
+/*
+ * Note that the current GC algorithm would break reflinks and thus duplicate
+ * data that was shared by multiple owners before. Because of that, reflinks
+ * are currently not supported on zoned file systems: a zoned file system
+ * with the reflink feature can neither be created nor mounted.
+ */
+static int
+xfs_zoned_gcd(
+ void *private)
+{
+ struct xfs_zone_gc_data *data = private;
+ struct xfs_mount *mp = data->mp;
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ unsigned int nofs_flag;
+
+ nofs_flag = memalloc_nofs_save();
+ set_freezable();
+
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
+ xfs_set_zonegc_running(mp);
+ if (xfs_zone_gc_handle_work(data))
+ continue;
+
+ if (list_empty(&data->reading) &&
+ list_empty(&data->writing) &&
+ list_empty(&data->resetting) &&
+ !zi->zi_reset_list) {
+ xfs_clear_zonegc_running(mp);
+ xfs_zoned_resv_wake_all(mp);
+
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ break;
+ }
+
+ if (kthread_should_park()) {
+ __set_current_state(TASK_RUNNING);
+ kthread_parkme();
+ continue;
+ }
+ }
+
+ schedule();
+ }
+ xfs_clear_zonegc_running(mp);
+
+ if (data->iter.victim_rtg)
+ xfs_rtgroup_rele(data->iter.victim_rtg);
+
+ memalloc_nofs_restore(nofs_flag);
+ xfs_zone_gc_data_free(data);
+ return 0;
+}
+
+void
+xfs_zone_gc_start(
+ struct xfs_mount *mp)
+{
+ if (xfs_has_zoned(mp))
+ kthread_unpark(mp->m_zone_info->zi_gc_thread);
+}
+
+void
+xfs_zone_gc_stop(
+ struct xfs_mount *mp)
+{
+ if (xfs_has_zoned(mp))
+ kthread_park(mp->m_zone_info->zi_gc_thread);
+}
+
+int
+xfs_zone_gc_mount(
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_zone_gc_data *data;
+ struct xfs_open_zone *oz;
+ int error;
+
+ /*
+ * If there are no free zones available for GC, pick the open zone with
+ * the least used space to GC into. This should only happen after an
+ * unclean shutdown near ENOSPC while GC was ongoing.
+ *
+ * We also need to do this for the first gc zone allocation if we
+ * unmounted while at the open limit.
+ */
+ if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
+ zi->zi_nr_open_zones == mp->m_max_open_zones)
+ oz = xfs_zone_gc_steal_open(zi);
+ else
+ oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
+ if (!oz) {
+ xfs_warn(mp, "unable to allocate a zone for gc");
+ error = -EIO;
+ goto out;
+ }
+
+ trace_xfs_zone_gc_target_opened(oz->oz_rtg);
+ zi->zi_open_gc_zone = oz;
+
+ data = xfs_zone_gc_data_alloc(mp);
+ if (!data) {
+ error = -ENOMEM;
+ goto out_put_gc_zone;
+ }
+
+ mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
+ "xfs-zone-gc/%s", mp->m_super->s_id);
+ if (IS_ERR(mp->m_zone_info->zi_gc_thread)) {
+ xfs_warn(mp, "unable to create zone gc thread");
+ error = PTR_ERR(mp->m_zone_info->zi_gc_thread);
+ goto out_free_gc_data;
+ }
+
+ /* xfs_zone_gc_start will unpark for rw mounts */
+ kthread_park(mp->m_zone_info->zi_gc_thread);
+ return 0;
+
+out_free_gc_data:
+ kfree(data);
+out_put_gc_zone:
+ xfs_open_zone_put(zi->zi_open_gc_zone);
+out:
+ return error;
+}
+
+void
+xfs_zone_gc_unmount(
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+
+ kthread_stop(zi->zi_gc_thread);
+ if (zi->zi_open_gc_zone)
+ xfs_open_zone_put(zi->zi_open_gc_zone);
+}
diff --git a/fs/xfs/xfs_zone_info.c b/fs/xfs/xfs_zone_info.c
new file mode 100644
index 000000000000..733bcc2f8645
--- /dev/null
+++ b/fs/xfs/xfs_zone_info.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023-2025 Christoph Hellwig.
+ * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zone_priv.h"
+
+static const char xfs_write_hint_shorthand[6][16] = {
+ "NOT_SET", "NONE", "SHORT", "MEDIUM", "LONG", "EXTREME"};
+
+static inline const char *
+xfs_write_hint_to_str(
+ uint8_t write_hint)
+{
+ if (write_hint > WRITE_LIFE_EXTREME)
+ return "UNKNOWN";
+ return xfs_write_hint_shorthand[write_hint];
+}
+
+static void
+xfs_show_open_zone(
+ struct seq_file *m,
+ struct xfs_open_zone *oz)
+{
+ seq_printf(m, "\t zone %d, wp %u, written %u, used %u, hint %s\n",
+ rtg_rgno(oz->oz_rtg),
+ oz->oz_write_pointer, oz->oz_written,
+ rtg_rmap(oz->oz_rtg)->i_used_blocks,
+ xfs_write_hint_to_str(oz->oz_write_hint));
+}
+
+static void
+xfs_show_full_zone_used_distribution(
+ struct seq_file *m,
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ unsigned int reclaimable = 0, full, i;
+
+ spin_lock(&zi->zi_used_buckets_lock);
+ for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) {
+ unsigned int entries = zi->zi_used_bucket_entries[i];
+
+ seq_printf(m, "\t %2u..%2u%%: %u\n",
+ i * (100 / XFS_ZONE_USED_BUCKETS),
+ (i + 1) * (100 / XFS_ZONE_USED_BUCKETS) - 1,
+ entries);
+ reclaimable += entries;
+ }
+ spin_unlock(&zi->zi_used_buckets_lock);
+
+ full = mp->m_sb.sb_rgcount;
+ if (zi->zi_open_gc_zone)
+ full--;
+ full -= zi->zi_nr_open_zones;
+ full -= atomic_read(&zi->zi_nr_free_zones);
+ full -= reclaimable;
+
+ seq_printf(m, "\t 100%%: %u\n", full);
+}
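The "100%" row in that histogram is derived rather than tracked: every zone that is neither free, open (including the GC zone), nor in a reclaimable bucket must be fully written. A worked example with made-up counts:

#include <stdio.h>

int main(void)
{
	unsigned int rgcount = 100;	/* total zones (rtgroups) */
	unsigned int open = 4, has_gc_zone = 1, free = 20, reclaimable = 30;
	unsigned int full = rgcount - has_gc_zone - open - free - reclaimable;

	printf("fully written zones: %u\n", full);	/* prints 45 */
	return 0;
}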
"\tuser available RT blocks: %lld\n", + xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE)); + seq_printf(m, "\treserved available RT blocks: %lld\n", + mp->m_free[XC_FREE_RTAVAILABLE].res_avail); + seq_printf(m, "\tRT reservations required: %d\n", + !list_empty_careful(&zi->zi_reclaim_reservations)); + seq_printf(m, "\tRT GC required: %d\n", + xfs_zoned_need_gc(mp)); + + seq_printf(m, "\tfree zones: %d\n", atomic_read(&zi->zi_nr_free_zones)); + seq_puts(m, "\topen zones:\n"); + spin_lock(&zi->zi_open_zones_lock); + list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) + xfs_show_open_zone(m, oz); + if (zi->zi_open_gc_zone) { + seq_puts(m, "\topen gc zone:\n"); + xfs_show_open_zone(m, zi->zi_open_gc_zone); + } + spin_unlock(&zi->zi_open_zones_lock); + seq_puts(m, "\tused blocks distribution (fully written zones):\n"); + xfs_show_full_zone_used_distribution(m, mp); +} diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h new file mode 100644 index 000000000000..ab696975a993 --- /dev/null +++ b/fs/xfs/xfs_zone_priv.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _XFS_ZONE_PRIV_H +#define _XFS_ZONE_PRIV_H + +struct xfs_open_zone { + /* + * Entry in the open zone list and refcount. Protected by + * zi_open_zones_lock in struct xfs_zone_info. + */ + struct list_head oz_entry; + atomic_t oz_ref; + + /* + * oz_write_pointer is the write pointer at which space is handed out + * for conventional zones, or simple the count of blocks handed out + * so far for sequential write required zones and is protected by + * oz_alloc_lock/ + */ + spinlock_t oz_alloc_lock; + xfs_rgblock_t oz_write_pointer; + + /* + * oz_written is the number of blocks for which we've received a + * write completion. oz_written must always be <= oz_write_pointer + * and is protected by the ILOCK of the rmap inode. + */ + xfs_rgblock_t oz_written; + + /* + * Write hint (data temperature) assigned to this zone, or + * WRITE_LIFE_NOT_SET if none was set. + */ + enum rw_hint oz_write_hint; + + /* + * Is this open zone used for garbage collection? There can only be a + * single open GC zone, which is pointed to by zi_open_gc_zone in + * struct xfs_zone_info. Constant over the life time of an open zone. + */ + bool oz_is_gc; + + /* + * Pointer to the RT groups structure for this open zone. Constant over + * the life time of an open zone. + */ + struct xfs_rtgroup *oz_rtg; +}; + +/* + * Number of bitmap buckets to track reclaimable zones. There are 10 buckets + * so that each 10% of the usable capacity get their own bucket and GC can + * only has to walk the bitmaps of the lesser used zones if there are any. + */ +#define XFS_ZONE_USED_BUCKETS 10u + +struct xfs_zone_info { + /* + * List of pending space reservations: + */ + spinlock_t zi_reservation_lock; + struct list_head zi_reclaim_reservations; + + /* + * List and number of open zones: + */ + spinlock_t zi_open_zones_lock; + struct list_head zi_open_zones; + unsigned int zi_nr_open_zones; + + /* + * Free zone search cursor and number of free zones: + */ + unsigned long zi_free_zone_cursor; + atomic_t zi_nr_free_zones; + + /* + * Wait queue to wait for free zones or open zone resources to become + * available: + */ + wait_queue_head_t zi_zone_wait; + + /* + * Pointer to the GC thread, and the current open zone used by GC + * (if any). + * + * zi_open_gc_zone is mostly private to the GC thread, but can be read + * for debugging from other threads, in which case zi_open_zones_lock + * must be taken to access it. 
+
+struct xfs_zone_info {
+ /*
+ * List of pending space reservations:
+ */
+ spinlock_t zi_reservation_lock;
+ struct list_head zi_reclaim_reservations;
+
+ /*
+ * List and number of open zones:
+ */
+ spinlock_t zi_open_zones_lock;
+ struct list_head zi_open_zones;
+ unsigned int zi_nr_open_zones;
+
+ /*
+ * Free zone search cursor and number of free zones:
+ */
+ unsigned long zi_free_zone_cursor;
+ atomic_t zi_nr_free_zones;
+
+ /*
+ * Wait queue to wait for free zones or open zone resources to become
+ * available:
+ */
+ wait_queue_head_t zi_zone_wait;
+
+ /*
+ * Pointer to the GC thread, and the current open zone used by GC
+ * (if any).
+ *
+ * zi_open_gc_zone is mostly private to the GC thread, but can be read
+ * for debugging from other threads, in which case zi_open_zones_lock
+ * must be taken to access it.
+ */
+ struct task_struct *zi_gc_thread;
+ struct xfs_open_zone *zi_open_gc_zone;
+
+ /*
+ * List of zones that need a reset:
+ */
+ spinlock_t zi_reset_list_lock;
+ struct xfs_group *zi_reset_list;
+
+ /*
+ * A set of bitmaps to bucket-sort reclaimable zones by used blocks, to
+ * help garbage collection quickly find the best candidate for reclaim.
+ */
+ spinlock_t zi_used_buckets_lock;
+ unsigned int zi_used_bucket_entries[XFS_ZONE_USED_BUCKETS];
+ unsigned long *zi_used_bucket_bitmap[XFS_ZONE_USED_BUCKETS];
+};
+
+struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp,
+ enum rw_hint write_hint, bool is_gc);
+
+int xfs_zone_gc_reset_sync(struct xfs_rtgroup *rtg);
+bool xfs_zoned_need_gc(struct xfs_mount *mp);
+int xfs_zone_gc_mount(struct xfs_mount *mp);
+void xfs_zone_gc_unmount(struct xfs_mount *mp);
+
+void xfs_zoned_resv_wake_all(struct xfs_mount *mp);
+
+#endif /* _XFS_ZONE_PRIV_H */
diff --git a/fs/xfs/xfs_zone_space_resv.c b/fs/xfs/xfs_zone_space_resv.c
new file mode 100644
index 000000000000..93c9a7721139
--- /dev/null
+++ b/fs/xfs/xfs_zone_space_resv.c
@@ -0,0 +1,263 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023-2025 Christoph Hellwig.
+ * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zone_priv.h"
+#include "xfs_zones.h"
+
+/*
+ * Note: the zoned allocator does not support an rtextsize > 1, so this code
+ * and the allocator itself use file system blocks interchangeably with
+ * realtime extents, without doing the otherwise required conversions.
+ */
+
+/*
+ * Per-task space reservation.
+ *
+ * Tasks that need to wait for GC to free up space allocate one of these
+ * on-stack and add it to the per-mount zi_reclaim_reservations list.
+ * The GC thread will then wake the tasks in order when space becomes
+ * available.
+ */
+struct xfs_zone_reservation {
+ struct list_head entry;
+ struct task_struct *task;
+ xfs_filblks_t count_fsb;
+};
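The on-stack reservation plus FIFO wakeup is a self-contained pattern; here is a rough pthread model of it (userspace stand-in with invented names, and it hands blocks to waiters directly, whereas the kernel version parks in TASK_KILLABLE and retries xfs_dec_freecounter() itself after being woken):

#include <pthread.h>
#include <stdio.h>

struct reservation {
	struct reservation *next;
	unsigned long count;
	int granted;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static struct reservation *head;
static struct reservation **tail = &head;
static unsigned long available;

/* Queue in FIFO order and sleep until the producer grants the request. */
static void reserve(unsigned long count)
{
	struct reservation r = { .count = count };

	pthread_mutex_lock(&lock);
	if (!head && available >= count) {	/* fast path: no waiters */
		available -= count;
		pthread_mutex_unlock(&lock);
		return;
	}
	*tail = &r;
	tail = &r.next;
	while (!r.granted)
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
}

/* Add freed space and satisfy waiters strictly front-to-back. */
static void add_available(unsigned long count)
{
	pthread_mutex_lock(&lock);
	available += count;
	while (head && head->count <= available) {
		struct reservation *r = head;

		available -= r->count;
		head = r->next;
		if (!head)
			tail = &head;
		r->granted = 1;
	}
	pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
}

static void *waiter(void *arg)
{
	(void)arg;
	reserve(16);
	printf("16 blocks granted\n");
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waiter, NULL);
	add_available(16);	/* either grants directly or wakes the waiter */
	pthread_join(t, NULL);
	return 0;
}

The strict front-to-back grant matters: skipping a large waiter in favor of later small ones would starve it indefinitely under steady load.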
+
+/*
+ * Calculate the number of reserved blocks.
+ *
+ * XC_FREE_RTEXTENTS counts the user-available capacity, up to which the file
+ * system can be filled, while XC_FREE_RTAVAILABLE counts the blocks instantly
+ * available for writes without waiting for GC.
+ *
+ * For XC_FREE_RTAVAILABLE only the smaller reservation required for GC and
+ * block zeroing is excluded from the user capacity, while XC_FREE_RTEXTENTS
+ * is further restricted by at least one zone as well as the optional
+ * persistently reserved blocks. This allows the allocator to run more
+ * smoothly by not always triggering GC.
+ */
+uint64_t
+xfs_zoned_default_resblks(
+ struct xfs_mount *mp,
+ enum xfs_free_counter ctr)
+{
+ switch (ctr) {
+ case XC_FREE_RTEXTENTS:
+ return (uint64_t)XFS_RESERVED_ZONES *
+ mp->m_groups[XG_TYPE_RTG].blocks +
+ mp->m_sb.sb_rtreserved;
+ case XC_FREE_RTAVAILABLE:
+ return (uint64_t)XFS_GC_ZONES *
+ mp->m_groups[XG_TYPE_RTG].blocks;
+ default:
+ ASSERT(0);
+ return 0;
+ }
+}
+
+void
+xfs_zoned_resv_wake_all(
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_zone_reservation *reservation;
+
+ spin_lock(&zi->zi_reservation_lock);
+ list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry)
+ wake_up_process(reservation->task);
+ spin_unlock(&zi->zi_reservation_lock);
+}
+
+void
+xfs_zoned_add_available(
+ struct xfs_mount *mp,
+ xfs_filblks_t count_fsb)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_zone_reservation *reservation;
+
+ if (list_empty_careful(&zi->zi_reclaim_reservations)) {
+ xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
+ return;
+ }
+
+ spin_lock(&zi->zi_reservation_lock);
+ xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
+ count_fsb = xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE);
+ list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) {
+ if (reservation->count_fsb > count_fsb)
+ break;
+ wake_up_process(reservation->task);
+ count_fsb -= reservation->count_fsb;
+ }
+ spin_unlock(&zi->zi_reservation_lock);
+}
+
+static int
+xfs_zoned_space_wait_error(
+ struct xfs_mount *mp)
+{
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+ if (fatal_signal_pending(current))
+ return -EINTR;
+ return 0;
+}
+
+static int
+xfs_zoned_reserve_available(
+ struct xfs_inode *ip,
+ xfs_filblks_t count_fsb,
+ unsigned int flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_zone_reservation reservation = {
+ .task = current,
+ .count_fsb = count_fsb,
+ };
+ int error;
+
+ /*
+ * If there are no waiters, try to directly grab the available blocks
+ * from the percpu counter.
+ *
+ * If the caller wants to dip into the reserved pool, also bypass the
+ * wait list. This relies on the fact that we have a very generously
+ * sized reserved pool that always has enough space. If the reserved
+ * allocations fail we're in trouble.
+ */
+ if (likely(list_empty_careful(&zi->zi_reclaim_reservations) ||
+ (flags & XFS_ZR_RESERVED))) {
+ error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
+ flags & XFS_ZR_RESERVED);
+ if (error != -ENOSPC)
+ return error;
+ }
+
+ if (flags & XFS_ZR_NOWAIT)
+ return -EAGAIN;
+
+ spin_lock(&zi->zi_reservation_lock);
+ list_add_tail(&reservation.entry, &zi->zi_reclaim_reservations);
+ while ((error = xfs_zoned_space_wait_error(mp)) == 0) {
+ set_current_state(TASK_KILLABLE);
+
+ error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
+ flags & XFS_ZR_RESERVED);
+ if (error != -ENOSPC)
+ break;
+
+ /*
+ * Make sure to start GC if it is not running already. As we
+ * check the rtavailable count when filling up zones, GC is
+ * normally already running at this point, but in some setups
+ * with very few zones we may completely run out of non-
+ * reserved blocks in between filling zones.
+ */
+ if (!xfs_is_zonegc_running(mp))
+ wake_up_process(zi->zi_gc_thread);
+
+ /*
+ * If there is no reclaimable group left and we aren't still
+ * processing a pending GC request, give up as we're fully out
+ * of space.
+ */
+ if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE) &&
+ !xfs_is_zonegc_running(mp))
+ break;
+
+ spin_unlock(&zi->zi_reservation_lock);
+ schedule();
+ spin_lock(&zi->zi_reservation_lock);
+ }
+ list_del(&reservation.entry);
+ spin_unlock(&zi->zi_reservation_lock);
+
+ __set_current_state(TASK_RUNNING);
+ return error;
+}
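A worked example of the pool sizing done by xfs_zoned_default_resblks() above, with made-up constants (RESERVED_ZONES and GC_ZONES are stand-ins for XFS_RESERVED_ZONES and XFS_GC_ZONES, whose real values live in xfs_zones.h):

#include <stdio.h>
#include <stdint.h>

#define RESERVED_ZONES 2u	/* assumed value, for illustration only */
#define GC_ZONES       1u	/* assumed value, for illustration only */

int main(void)
{
	uint64_t zone_blocks = 65536;	/* blocks per zone (rtgroup) */
	uint64_t rtreserved = 0;	/* optional persistent reservation */

	/* XC_FREE_RTEXTENTS: capacity withheld from the user */
	uint64_t resv_rtextents =
		(uint64_t)RESERVED_ZONES * zone_blocks + rtreserved;
	/* XC_FREE_RTAVAILABLE: blocks kept instantly writable for GC */
	uint64_t resv_rtavailable = (uint64_t)GC_ZONES * zone_blocks;

	printf("rtextents reserve: %llu blocks\n",
	       (unsigned long long)resv_rtextents);	/* 131072 */
	printf("rtavailable reserve: %llu blocks\n",
	       (unsigned long long)resv_rtavailable);	/* 65536 */
	return 0;
}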
+
+/*
+ * Implement greedy space allocation for short writes by trying to grab all
+ * that is left after locking out other threads from trying to do the same.
+ *
+ * This isn't exactly optimal and can hopefully be replaced by a proper
+ * percpu_counter primitive one day.
+ */
+static int
+xfs_zoned_reserve_extents_greedy(
+ struct xfs_inode *ip,
+ xfs_filblks_t *count_fsb,
+ unsigned int flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ s64 len = *count_fsb;
+ int error = -ENOSPC;
+
+ spin_lock(&zi->zi_reservation_lock);
+ len = min(len, xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
+ if (len > 0) {
+ *count_fsb = len;
+ error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, *count_fsb,
+ flags & XFS_ZR_RESERVED);
+ }
+ spin_unlock(&zi->zi_reservation_lock);
+ return error;
+}
+
+int
+xfs_zoned_space_reserve(
+ struct xfs_inode *ip,
+ xfs_filblks_t count_fsb,
+ unsigned int flags,
+ struct xfs_zone_alloc_ctx *ac)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ int error;
+
+ ASSERT(ac->reserved_blocks == 0);
+ ASSERT(ac->open_zone == NULL);
+
+ error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
+ flags & XFS_ZR_RESERVED);
+ if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1)
+ error = xfs_zoned_reserve_extents_greedy(ip, &count_fsb, flags);
+ if (error)
+ return error;
+
+ error = xfs_zoned_reserve_available(ip, count_fsb, flags);
+ if (error) {
+ xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb);
+ return error;
+ }
+ ac->reserved_blocks = count_fsb;
+ return 0;
+}
+
+void
+xfs_zoned_space_unreserve(
+ struct xfs_inode *ip,
+ struct xfs_zone_alloc_ctx *ac)
+{
+ if (ac->reserved_blocks > 0) {
+ struct xfs_mount *mp = ip->i_mount;
+
+ xfs_zoned_add_available(mp, ac->reserved_blocks);
+ xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks);
+ }
+ if (ac->open_zone)
+ xfs_open_zone_put(ac->open_zone);
+}
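The greedy fallback reduces to a small amount of counter arithmetic; a rough single-lock userspace model of it (hypothetical names; the kernel version charges percpu counters and honors XFS_ZR_RESERVED):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t resv_lock = PTHREAD_MUTEX_INITIALIZER;
static long free_blocks = 100;

/* All-or-nothing reservation, the normal path. */
static int dec_free(long count)
{
	int ret = -ENOSPC;

	pthread_mutex_lock(&resv_lock);
	if (free_blocks >= count) {
		free_blocks -= count;
		ret = 0;
	}
	pthread_mutex_unlock(&resv_lock);
	return ret;
}

/* Greedy fallback: lock out other reservers and take whatever is left. */
static int reserve_greedy(long *count)
{
	int ret = -ENOSPC;

	pthread_mutex_lock(&resv_lock);
	if (free_blocks > 0) {
		if (*count > free_blocks)
			*count = free_blocks;	/* trim the request */
		free_blocks -= *count;
		ret = 0;
	}
	pthread_mutex_unlock(&resv_lock);
	return ret;
}

int main(void)
{
	long want = 150;

	if (dec_free(want) == -ENOSPC && reserve_greedy(&want) == 0)
		printf("greedy reservation trimmed request to %ld blocks\n",
		       want);
	return 0;
}

As with xfs_zoned_space_reserve() and xfs_zoned_space_unreserve() above, any real caller must return exactly what it reserved, trimmed or not, once the write completes or fails.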