Diffstat (limited to 'fs/btrfs')
75 files changed, 2383 insertions, 1967 deletions
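A cleanup that recurs throughout this series replaces open-coded btrfs_alloc_path()/btrfs_free_path() pairs with BTRFS_PATH_AUTO_FREE(), which frees the path automatically when it goes out of scope and lets the out:/goto cleanup labels disappear. As a hedged sketch, not quoted from this diff (the real definition lives in fs/btrfs/ctree.h, and example_lookup() is a made-up caller for illustration), such a helper can be built on the scope-based cleanup support from <linux/cleanup.h> roughly like this:

#include <linux/cleanup.h>

/* btrfs_free_path() tolerates a NULL pointer, so the cleanup expression can call it directly. */
DEFINE_FREE(btrfs_free_path, struct btrfs_path *, btrfs_free_path(_T))

#define BTRFS_PATH_AUTO_FREE(path_name) \
	struct btrfs_path *path_name __free(btrfs_free_path) = NULL

/* Typical use, mirroring the conversions in the hunks below: */
static int example_lookup(struct btrfs_root *root, const struct btrfs_key *key)
{
	BTRFS_PATH_AUTO_FREE(path);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	/* Every return after this point frees the path automatically. */
	return btrfs_search_slot(NULL, root, key, path, 0, 0);
}

The early-return conversions in block-group.c, dev-replace.c, dir-item.c and disk-io.c below rely on exactly this behavior.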
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 7a7e0ef69973..15ea6348800b 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -12,6 +12,7 @@ #include <linux/string.h> #include <linux/mm.h> #include <uapi/linux/btrfs_tree.h> +#include "extent_io.h" struct extent_buffer; diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h index 48b9ddae4a46..0458cd51ed48 100644 --- a/fs/btrfs/acl.h +++ b/fs/btrfs/acl.h @@ -3,6 +3,8 @@ #ifndef BTRFS_ACL_H #define BTRFS_ACL_H +#include <linux/types.h> + struct posix_acl; struct inode; struct btrfs_trans_handle; diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index a4c51600a408..f3bffe08b290 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -168,7 +168,7 @@ static inline void thresh_exec_hook(struct btrfs_workqueue *wq) { int new_current_active; long pending; - int need_change = 0; + bool need_change = false; if (wq->thresh == NO_THRESHOLD) return; @@ -196,15 +196,14 @@ static inline void thresh_exec_hook(struct btrfs_workqueue *wq) new_current_active--; new_current_active = clamp_val(new_current_active, 1, wq->limit_active); if (new_current_active != wq->current_active) { - need_change = 1; + need_change = true; wq->current_active = new_current_active; } out: spin_unlock(&wq->thres_lock); - if (need_change) { + if (need_change) workqueue_set_max_active(wq->normal_wq, wq->current_active); - } } static void run_ordered_work(struct btrfs_workqueue *wq, @@ -296,7 +295,7 @@ static void btrfs_work_helper(struct work_struct *normal_work) struct btrfs_work *work = container_of(normal_work, struct btrfs_work, normal_work); struct btrfs_workqueue *wq = work->wq; - int need_order = 0; + bool need_order = false; /* * We should not touch things inside work in the following cases: @@ -307,7 +306,7 @@ static void btrfs_work_helper(struct work_struct *normal_work) * So we save the needed things here. */ if (work->ordered_func) - need_order = 1; + need_order = true; trace_btrfs_work_sched(work); thresh_exec_hook(wq); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 3d3923cfc357..5936cff80ff3 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1399,11 +1399,11 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx, ASSERT(ctx->roots == NULL); key.objectid = ctx->bytenr; - key.offset = (u64)-1; if (btrfs_fs_incompat(ctx->fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; path = btrfs_alloc_path(); if (!path) @@ -2206,11 +2206,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, struct btrfs_extent_item *ei; struct btrfs_key key; + key.objectid = logical; if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; - key.objectid = logical; key.offset = (u64)-1; ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index bc2555c44a12..8c2eee1f1878 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -97,33 +97,17 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, return bbio; } -/* Free a bio that was never submitted to the underlying device. 
*/ -static void btrfs_cleanup_bio(struct btrfs_bio *bbio) -{ - if (bbio_has_ordered_extent(bbio)) - btrfs_put_ordered_extent(bbio->ordered); - bio_put(&bbio->bio); -} - -static void __btrfs_bio_end_io(struct btrfs_bio *bbio) -{ - if (bbio_has_ordered_extent(bbio)) { - struct btrfs_ordered_extent *ordered = bbio->ordered; - - bbio->end_io(bbio); - btrfs_put_ordered_extent(ordered); - } else { - bbio->end_io(bbio); - } -} - void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { bbio->bio.bi_status = status; if (bbio->bio.bi_pool == &btrfs_clone_bioset) { struct btrfs_bio *orig_bbio = bbio->private; - btrfs_cleanup_bio(bbio); + /* Free bio that was never submitted to the underlying device. */ + if (bbio_has_ordered_extent(bbio)) + btrfs_put_ordered_extent(bbio->ordered); + bio_put(&bbio->bio); + bbio = orig_bbio; } @@ -138,7 +122,15 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) /* Load split bio's error which might be set above. */ if (status == BLK_STS_OK) bbio->bio.bi_status = READ_ONCE(bbio->status); - __btrfs_bio_end_io(bbio); + + if (bbio_has_ordered_extent(bbio)) { + struct btrfs_ordered_extent *ordered = bbio->ordered; + + bbio->end_io(bbio); + btrfs_put_ordered_extent(ordered); + } else { + bbio->end_io(bbio); + } } } @@ -581,7 +573,7 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free) /* If an error occurred we just want to clean up the bio and move on. */ if (bio->bi_status) { - btrfs_bio_end_io(async->bbio, async->bbio->bio.bi_status); + btrfs_bio_end_io(async->bbio, bio->bi_status); return; } diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index c0a8f7d92acc..a8129f1ce78c 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -191,21 +191,21 @@ static int btrfs_bg_start_cmp(const struct rb_node *new, /* * This adds the block group to the fs_info rb tree for the block group cache */ -static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, - struct btrfs_block_group *block_group) +static int btrfs_add_block_group_cache(struct btrfs_block_group *block_group) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct rb_node *exist; int ret = 0; ASSERT(block_group->length != 0); - write_lock(&info->block_group_cache_lock); + write_lock(&fs_info->block_group_cache_lock); exist = rb_find_add_cached(&block_group->cache_node, - &info->block_group_cache_tree, btrfs_bg_start_cmp); + &fs_info->block_group_cache_tree, btrfs_bg_start_cmp); if (exist) ret = -EEXIST; - write_unlock(&info->block_group_cache_lock); + write_unlock(&fs_info->block_group_cache_lock); return ret; } @@ -584,7 +584,7 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ struct btrfs_root *extent_root; u64 search_offset; u64 search_end = block_group->start + block_group->length; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key search_key; int ret = 0; @@ -626,7 +626,6 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ lockdep_assert_held(&caching_ctl->mutex); lockdep_assert_held_read(&fs_info->commit_root_sem); - btrfs_free_path(path); return ret; } @@ -738,8 +737,8 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) path->reada = READA_FORWARD; key.objectid = last; - key.offset = 0; key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = 0; next: ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); @@ -785,8 +784,8 @@ next: if (key.objectid < last) { key.objectid = last; - key.offset = 0; key.type = 
BTRFS_EXTENT_ITEM_KEY; + key.offset = 0; btrfs_release_path(path); goto next; } @@ -1457,6 +1456,32 @@ out: } /* + * Link the block_group to a list via bg_list. + * + * @bg: The block_group to link to the list. + * @list: The list to link it to. + * + * Use this rather than list_add_tail() directly to ensure proper + * locking and refcounting. + * + * Returns: true if the bg was linked with a refcount bump and false otherwise. + */ +static bool btrfs_link_bg_list(struct btrfs_block_group *bg, struct list_head *list) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + bool added = false; + + spin_lock(&fs_info->unused_bgs_lock); + if (list_empty(&bg->bg_list)) { + btrfs_get_block_group(bg); + list_add_tail(&bg->bg_list, list); + added = true; + } + spin_unlock(&fs_info->unused_bgs_lock); + return added; +} + +/* * Process the unused_bgs list and remove any that don't have any allocated * space inside of them. */ @@ -1571,8 +1596,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * drop under the "next" label for the * fs_info->unused_bgs list. */ - btrfs_get_block_group(block_group); - list_add_tail(&block_group->bg_list, &retry_list); + btrfs_link_bg_list(block_group, &retry_list); trace_btrfs_skip_unused_block_group(block_group); spin_unlock(&block_group->lock); @@ -1823,7 +1847,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp); while (!list_empty(&fs_info->reclaim_bgs)) { u64 zone_unusable; - u64 reclaimed; + u64 used; + u64 reserved; int ret = 0; bg = list_first_entry(&fs_info->reclaim_bgs, @@ -1887,6 +1912,17 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) up_write(&space_info->groups_sem); goto next; } + + /* + * Cache the zone_unusable value before turning the block group + * to read only. As soon as the block group is read only its + * zone_unusable value gets moved to the block group's read-only + * bytes and isn't available for calculations anymore. We also + * cache it before unlocking the block group, to prevent races + * (reports from KCSAN and such tools) with tasks updating it. + */ + zone_unusable = bg->zone_unusable; + spin_unlock(&bg->lock); spin_unlock(&space_info->lock); @@ -1903,31 +1939,47 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) goto next; } - /* - * Cache the zone_unusable value before turning the block group - * to read only. As soon as the blog group is read only it's - * zone_unusable value gets moved to the block group's read-only - * bytes and isn't available for calculations anymore. - */ - zone_unusable = bg->zone_unusable; ret = inc_block_group_ro(bg, 0); up_write(&space_info->groups_sem); if (ret < 0) goto next; + /* + * The amount of bytes reclaimed corresponds to the sum of the + * "used" and "reserved" counters. We have set the block group + * to RO above, which prevents reservations from happening but + * we may have existing reservations for which allocation has + * not yet been done - btrfs_update_block_group() was not yet + * called, which is where we will transfer a reserved extent's + * size from the "reserved" counter to the "used" counter - this + * happens when running delayed references. When we relocate the + * chunk below, relocation first flushes delalloc, waits for + * ordered extent completion (which is where we create delayed + * references for data extents) and commits the current + * transaction (which runs delayed references), and only after + * it does the actual work to move extents out of the block + * group.
So the reported amount of reclaimed bytes is + * effectively the sum of the 'used' and 'reserved' counters. + */ + spin_lock(&bg->lock); + used = bg->used; + reserved = bg->reserved; + spin_unlock(&bg->lock); + btrfs_info(fs_info, - "reclaiming chunk %llu with %llu%% used %llu%% unusable", + "reclaiming chunk %llu with %llu%% used %llu%% reserved %llu%% unusable", bg->start, - div64_u64(bg->used * 100, bg->length), + div64_u64(used * 100, bg->length), + div64_u64(reserved * 100, bg->length), div64_u64(zone_unusable * 100, bg->length)); trace_btrfs_reclaim_block_group(bg); - reclaimed = bg->used; ret = btrfs_relocate_chunk(fs_info, bg->start); if (ret) { btrfs_dec_block_group_ro(bg); btrfs_err(fs_info, "error relocating chunk %llu", bg->start); - reclaimed = 0; + used = 0; + reserved = 0; spin_lock(&space_info->lock); space_info->reclaim_errors++; if (READ_ONCE(space_info->periodic_reclaim)) @@ -1936,24 +1988,13 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) } spin_lock(&space_info->lock); space_info->reclaim_count++; - space_info->reclaim_bytes += reclaimed; + space_info->reclaim_bytes += used; + space_info->reclaim_bytes += reserved; spin_unlock(&space_info->lock); next: - if (ret && !READ_ONCE(space_info->periodic_reclaim)) { - /* Refcount held by the reclaim_bgs list after splice. */ - spin_lock(&fs_info->unused_bgs_lock); - /* - * This block group might be added to the unused list - * during the above process. Move it back to the - * reclaim list otherwise. - */ - if (list_empty(&bg->bg_list)) { - btrfs_get_block_group(bg); - list_add_tail(&bg->bg_list, &retry_list); - } - spin_unlock(&fs_info->unused_bgs_lock); - } + if (ret && !READ_ONCE(space_info->periodic_reclaim)) + btrfs_link_bg_list(bg, &retry_list); btrfs_put_block_group(bg); mutex_unlock(&fs_info->reclaim_bgs_lock); @@ -1993,13 +2034,8 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = bg->fs_info; - spin_lock(&fs_info->unused_bgs_lock); - if (list_empty(&bg->bg_list)) { - btrfs_get_block_group(bg); + if (btrfs_link_bg_list(bg, &fs_info->reclaim_bgs)) trace_btrfs_add_reclaim_block_group(bg); - list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs); - } - spin_unlock(&fs_info->unused_bgs_lock); } static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key *key, @@ -2410,7 +2446,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, goto error; } - ret = btrfs_add_block_group_cache(info, cache); + ret = btrfs_add_block_group_cache(cache); if (ret) { btrfs_remove_free_space_cache(cache); goto error; @@ -2459,7 +2495,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) bg->cached = BTRFS_CACHE_FINISHED; bg->used = map->chunk_len; bg->flags = map->type; - ret = btrfs_add_block_group_cache(fs_info, bg); + ret = btrfs_add_block_group_cache(bg); /* * We may have some valid block group cache added already, in * that case we skip to the next one. 
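The new btrfs_link_bg_list() helper above takes a block group reference under fs_info->unused_bgs_lock before linking, so every entry on such a list owns a reference. A hedged sketch of the consumer-side pattern it pairs with, as seen in btrfs_delete_unused_bgs() and btrfs_reclaim_bgs_work() (drain_bg_list_example() is a made-up name for illustration, not part of this diff):

static void drain_bg_list_example(struct btrfs_fs_info *fs_info,
				  struct list_head *list)
{
	spin_lock(&fs_info->unused_bgs_lock);
	while (!list_empty(list)) {
		struct btrfs_block_group *bg;

		bg = list_first_entry(list, struct btrfs_block_group, bg_list);
		/* Unlink under the same lock that btrfs_link_bg_list() takes. */
		list_del_init(&bg->bg_list);
		spin_unlock(&fs_info->unused_bgs_lock);

		/* ... process the block group here ... */

		/* Drop the reference the list was holding. */
		btrfs_put_block_group(bg);
		spin_lock(&fs_info->unused_bgs_lock);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
}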
@@ -2509,8 +2545,8 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) return fill_dummy_bgs(info); key.objectid = 0; - key.offset = 0; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + key.offset = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2641,7 +2677,7 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dev_extent *extent; struct extent_buffer *leaf; struct btrfs_key key; @@ -2658,7 +2694,7 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans, key.offset = start; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent)); if (ret) - goto out; + return ret; leaf = path->nodes[0]; extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); @@ -2666,10 +2702,8 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans, btrfs_set_dev_extent_chunk_objectid(leaf, extent, BTRFS_FIRST_CHUNK_TREE_OBJECTID); btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); - btrfs_set_dev_extent_length(leaf, extent, num_bytes); -out: - btrfs_free_path(path); + return ret; } @@ -2771,8 +2805,12 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) /* Already aborted the transaction if it failed. */ next: btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); + + spin_lock(&fs_info->unused_bgs_lock); list_del_init(&block_group->bg_list); clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags); + btrfs_put_block_group(block_group); + spin_unlock(&fs_info->unused_bgs_lock); /* * If the block group is still unused, add it to the list of @@ -2888,7 +2926,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran cache->space_info = btrfs_find_space_info(fs_info, cache->flags); ASSERT(cache->space_info); - ret = btrfs_add_block_group_cache(fs_info, cache); + ret = btrfs_add_block_group_cache(cache); if (ret) { btrfs_remove_free_space_cache(cache); btrfs_put_block_group(cache); @@ -2910,7 +2948,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran } #endif - list_add_tail(&cache->bg_list, &trans->new_bgs); + btrfs_link_bg_list(cache, &trans->new_bgs); btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info); set_avail_alloc_bits(fs_info, type); @@ -3306,7 +3344,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *cache, *tmp; struct btrfs_transaction *cur_trans = trans->transaction; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); if (list_empty(&cur_trans->dirty_bgs) || !btrfs_test_opt(fs_info, SPACE_CACHE)) @@ -3323,7 +3361,6 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) cache_save_setup(cache, trans, path); } - btrfs_free_path(path); return 0; } @@ -3346,7 +3383,7 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) struct btrfs_transaction *cur_trans = trans->transaction; int ret = 0; int should_put; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); LIST_HEAD(dirty); struct list_head *io = &cur_trans->io_bgs; int loops = 0; @@ -3501,7 +3538,6 @@ out: btrfs_cleanup_dirty_bgs(cur_trans, fs_info); } - btrfs_free_path(path); return ret; } @@ -3512,7 +3548,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) struct btrfs_transaction *cur_trans = trans->transaction; int ret = 0; int should_put; - struct btrfs_path *path; + 
BTRFS_PATH_AUTO_FREE(path); struct list_head *io = &cur_trans->io_bgs; path = btrfs_alloc_path(); @@ -3624,7 +3660,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) btrfs_put_block_group(cache); } - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b2fa33911c28..4e2952cf5766 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -145,6 +145,7 @@ struct btrfs_inode { * different from prop_compress and takes precedence if set. */ u8 defrag_compress; + s8 defrag_compress_level; /* * Lock for counters and all fields used to determine if the inode is in @@ -516,6 +517,14 @@ static inline void btrfs_assert_inode_locked(struct btrfs_inode *inode) lockdep_assert_held(&inode->vfs_inode.i_rwsem); } +static inline void btrfs_update_inode_mapping_flags(struct btrfs_inode *inode) +{ + if (inode->flags & BTRFS_INODE_NODATASUM) + mapping_clear_stable_writes(inode->vfs_inode.i_mapping); + else + mapping_set_stable_writes(inode->vfs_inode.i_mapping); +} + /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes @@ -524,7 +533,7 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected); bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u32 bio_offset, struct bio_vec *bv); -noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, +noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, bool nowait); @@ -584,9 +593,9 @@ void btrfs_free_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); int __init btrfs_init_cachep(void); void __cold btrfs_destroy_cachep(void); -struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, - struct btrfs_path *path); -struct inode *btrfs_iget(u64 ino, struct btrfs_root *root); +struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, + struct btrfs_path *path); +struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct folio *folio, u64 start, u64 len); int btrfs_update_inode(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 0c4d486c3048..e7f8ee5d48a4 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -740,7 +740,7 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zstd_compress, }; -static struct list_head *alloc_workspace(int type, unsigned int level) +static struct list_head *alloc_workspace(int type, int level) { switch (type) { case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(); @@ -818,7 +818,7 @@ static void btrfs_cleanup_workspace_manager(int type) * Preallocation makes a forward progress guarantees and we do not return * errors. 
*/ -struct list_head *btrfs_get_workspace(int type, unsigned int level) +struct list_head *btrfs_get_workspace(int type, int level) { struct workspace_manager *wsm; struct list_head *workspace; @@ -968,18 +968,28 @@ static void put_workspace(int type, struct list_head *ws) * Adjust @level according to the limits of the compression algorithm or * fallback to default */ -static unsigned int btrfs_compress_set_level(int type, unsigned level) +static int btrfs_compress_set_level(unsigned int type, int level) { const struct btrfs_compress_op *ops = btrfs_compress_op[type]; if (level == 0) level = ops->default_level; else - level = min(level, ops->max_level); + level = min(max(level, ops->min_level), ops->max_level); return level; } +/* + * Check whether the @level is within the valid range for the given type. + */ +bool btrfs_compress_level_valid(unsigned int type, int level) +{ + const struct btrfs_compress_op *ops = btrfs_compress_op[type]; + + return ops->min_level <= level && level <= ops->max_level; +} + /* Wrapper around find_get_page(), with extra error message. */ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, struct folio **in_folio_ret) @@ -1023,12 +1033,10 @@ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, * @total_out is an in/out parameter, must be set to the input length and will * be also used to return the total number of compressed bytes */ -int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping, +int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { - int type = btrfs_compress_type(type_level); - int level = btrfs_compress_level(type_level); const unsigned long orig_len = *total_out; struct list_head *workspace; int ret; @@ -1590,18 +1598,19 @@ out: /* * Convert the compression suffix (eg. after "zlib" starting with ":") to - * level, unrecognized string will set the default level + * level, unrecognized string will set the default level. Negative level + * numbers are allowed. */ -unsigned int btrfs_compress_str2level(unsigned int type, const char *str) +int btrfs_compress_str2level(unsigned int type, const char *str) { - unsigned int level = 0; + int level = 0; int ret; if (!type) return 0; if (str[0] == ':') { - ret = kstrtouint(str + 1, 10, &level); + ret = kstrtoint(str + 1, 10, &level); if (ret) level = 0; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 954034086d0d..df198623cc08 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -72,16 +72,6 @@ struct compressed_bio { struct btrfs_bio bbio; }; -static inline unsigned int btrfs_compress_type(unsigned int type_level) -{ - return (type_level & 0xF); -} - -static inline unsigned int btrfs_compress_level(unsigned int type_level) -{ - return ((type_level & 0xF0) >> 4); -} - /* @range_end must be exclusive. 
*/ static inline u32 btrfs_calc_input_length(u64 range_end, u64 cur) { @@ -93,7 +83,8 @@ static inline u32 btrfs_calc_input_length(u64 range_end, u64 cur) int __init btrfs_init_compress(void); void __cold btrfs_exit_compress(void); -int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping, +bool btrfs_compress_level_valid(unsigned int type, int level); +int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, @@ -107,7 +98,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, bool writeback); void btrfs_submit_compressed_read(struct btrfs_bio *bbio); -unsigned int btrfs_compress_str2level(unsigned int type, const char *str); +int btrfs_compress_str2level(unsigned int type, const char *str); struct folio *btrfs_alloc_compr_folio(void); void btrfs_free_compr_folio(struct folio *folio); @@ -131,14 +122,15 @@ struct workspace_manager { wait_queue_head_t ws_wait; }; -struct list_head *btrfs_get_workspace(int type, unsigned int level); +struct list_head *btrfs_get_workspace(int type, int level); void btrfs_put_workspace(int type, struct list_head *ws); struct btrfs_compress_op { struct workspace_manager *workspace_manager; /* Maximum level supported by the compression algorithm */ - unsigned int max_level; - unsigned int default_level; + int min_level; + int max_level; + int default_level; }; /* The heuristic workspaces are managed via the 0th workspace manager */ @@ -187,9 +179,9 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in, size_t destlen); void zstd_init_workspace_manager(void); void zstd_cleanup_workspace_manager(void); -struct list_head *zstd_alloc_workspace(unsigned int level); +struct list_head *zstd_alloc_workspace(int level); void zstd_free_workspace(struct list_head *ws); -struct list_head *zstd_get_workspace(unsigned int level); +struct list_head *zstd_get_workspace(int level); void zstd_put_workspace(struct list_head *ws); #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 92071ca0655f..a2e7979372cc 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1496,6 +1496,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (!p->skip_locking) { btrfs_unlock_up_safe(p, parent_level + 1); + btrfs_maybe_reset_lockdep_class(root, tmp); tmp_locked = true; btrfs_tree_read_lock(tmp); btrfs_release_path(p); @@ -1539,6 +1540,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (!p->skip_locking) { ASSERT(ret == -EAGAIN); + btrfs_maybe_reset_lockdep_class(root, tmp); tmp_locked = true; btrfs_tree_read_lock(tmp); btrfs_release_path(p); @@ -4304,7 +4306,7 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 data_size) { int ret = 0; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; unsigned long ptr; @@ -4318,7 +4320,6 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, write_extent_buffer(leaf, data, ptr, data_size); btrfs_mark_buffer_dirty(trans, leaf); } - btrfs_free_path(path); return ret; } @@ -4606,7 +4607,6 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, u64 min_trans) { struct extent_buffer *cur; - struct btrfs_key found_key; int slot; int sret; u32 nritems; @@ -4642,7 +4642,8 @@ again: goto find_next_key; ret = 0; 
path->slots[level] = slot; - btrfs_item_key_to_cpu(cur, &found_key, slot); + /* Save our key for returning back. */ + btrfs_item_key_to_cpu(cur, min_key, slot); goto out; } if (sret && slot > 0) @@ -4666,8 +4667,8 @@ find_next_key: * we didn't find a candidate key in this node, walk forward * and find another one */ + path->slots[level] = slot; if (slot >= nritems) { - path->slots[level] = slot; sret = btrfs_find_next_key(root, path, min_key, level, min_trans); if (sret == 0) { @@ -4677,11 +4678,10 @@ find_next_key: goto out; } } - /* save our key for returning back */ - btrfs_node_key_to_cpu(cur, &found_key, slot); - path->slots[level] = slot; if (level == path->lowest_level) { ret = 0; + /* Save our key for returning back. */ + btrfs_node_key_to_cpu(cur, min_key, slot); goto out; } cur = btrfs_read_node_slot(cur, slot); @@ -4698,10 +4698,8 @@ find_next_key: } out: path->keep_locks = keep_locks; - if (ret == 0) { + if (ret == 0) btrfs_unlock_up_safe(path, path->lowest_level + 1); - memcpy(min_key, &found_key, sizeof(found_key)); - } return ret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1096a80a64e7..075a06db43a1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -6,7 +6,7 @@ #ifndef BTRFS_CTREE_H #define BTRFS_CTREE_H -#include "linux/cleanup.h" +#include <linux/cleanup.h> #include <linux/spinlock.h> #include <linux/rbtree.h> #include <linux/mutex.h> diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 968dae953948..d4310d93f532 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -225,7 +225,7 @@ static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, struct file_ra_state *ra) { struct btrfs_root *inode_root; - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_ioctl_defrag_range_args range; int ret = 0; u64 cur = 0; @@ -250,24 +250,24 @@ again: goto cleanup; } - if (cur >= i_size_read(inode)) { - iput(inode); + if (cur >= i_size_read(&inode->vfs_inode)) { + iput(&inode->vfs_inode); goto cleanup; } /* Do a chunk of defrag */ - clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); + clear_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags); memset(&range, 0, sizeof(range)); range.len = (u64)-1; range.start = cur; range.extent_thresh = defrag->extent_thresh; - file_ra_state_init(ra, inode->i_mapping); + file_ra_state_init(ra, inode->vfs_inode.i_mapping); sb_start_write(fs_info->sb); ret = btrfs_defrag_file(inode, ra, &range, defrag->transid, - BTRFS_DEFRAG_BATCH); + BTRFS_DEFRAG_BATCH); sb_end_write(fs_info->sb); - iput(inode); + iput(&inode->vfs_inode); if (ret < 0) goto cleanup; @@ -1352,17 +1352,18 @@ out: * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without * defragging all the range). 
*/ -int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, +int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long sectors_defragged = 0; - u64 isize = i_size_read(inode); + u64 isize = i_size_read(&inode->vfs_inode); u64 cur; u64 last_byte; bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS); int compress_type = BTRFS_COMPRESS_ZLIB; + int compress_level = 0; int ret = 0; u32 extent_thresh = range->extent_thresh; pgoff_t start_index; @@ -1376,10 +1377,21 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, return -EINVAL; if (do_compress) { - if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES) - return -EINVAL; - if (range->compress_type) - compress_type = range->compress_type; + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL) { + if (range->compress.type >= BTRFS_NR_COMPRESS_TYPES) + return -EINVAL; + if (range->compress.type) { + compress_type = range->compress.type; + compress_level = range->compress.level; + if (!btrfs_compress_level_valid(compress_type, compress_level)) + return -EINVAL; + } + } else { + if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES) + return -EINVAL; + if (range->compress_type) + compress_type = range->compress_type; + } } if (extent_thresh == 0) @@ -1402,8 +1414,8 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, * defrag range can be written sequentially. */ start_index = cur >> PAGE_SHIFT; - if (start_index < inode->i_mapping->writeback_index) - inode->i_mapping->writeback_index = start_index; + if (start_index < inode->vfs_inode.i_mapping->writeback_index) + inode->vfs_inode.i_mapping->writeback_index = start_index; while (cur < last_byte) { const unsigned long prev_sectors_defragged = sectors_defragged; @@ -1420,27 +1432,29 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1; cluster_end = min(cluster_end, last_byte); - btrfs_inode_lock(BTRFS_I(inode), 0); - if (IS_SWAPFILE(inode)) { + btrfs_inode_lock(inode, 0); + if (IS_SWAPFILE(&inode->vfs_inode)) { ret = -ETXTBSY; - btrfs_inode_unlock(BTRFS_I(inode), 0); + btrfs_inode_unlock(inode, 0); break; } - if (!(inode->i_sb->s_flags & SB_ACTIVE)) { - btrfs_inode_unlock(BTRFS_I(inode), 0); + if (!(inode->vfs_inode.i_sb->s_flags & SB_ACTIVE)) { + btrfs_inode_unlock(inode, 0); break; } - if (do_compress) - BTRFS_I(inode)->defrag_compress = compress_type; - ret = defrag_one_cluster(BTRFS_I(inode), ra, cur, + if (do_compress) { + inode->defrag_compress = compress_type; + inode->defrag_compress_level = compress_level; + } + ret = defrag_one_cluster(inode, ra, cur, cluster_end + 1 - cur, extent_thresh, newer_than, do_compress, §ors_defragged, max_to_defrag, &last_scanned); if (sectors_defragged > prev_sectors_defragged) - balance_dirty_pages_ratelimited(inode->i_mapping); + balance_dirty_pages_ratelimited(inode->vfs_inode.i_mapping); - btrfs_inode_unlock(BTRFS_I(inode), 0); + btrfs_inode_unlock(inode, 0); if (ret < 0) break; cur = max(cluster_end + 1, last_scanned); @@ -1462,10 +1476,10 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, * need to be written back immediately. 
*/ if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) { - filemap_flush(inode->i_mapping); + filemap_flush(inode->vfs_inode.i_mapping); if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_flush(inode->i_mapping); + &inode->runtime_flags)) + filemap_flush(inode->vfs_inode.i_mapping); } if (range->compress_type == BTRFS_COMPRESS_LZO) btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); @@ -1474,9 +1488,9 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, ret = sectors_defragged; } if (do_compress) { - btrfs_inode_lock(BTRFS_I(inode), 0); - BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; - btrfs_inode_unlock(BTRFS_I(inode), 0); + btrfs_inode_lock(inode, 0); + inode->defrag_compress = BTRFS_COMPRESS_NONE; + btrfs_inode_unlock(inode, 0); } return ret; } diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h index 6b7596c4f0dc..a7f917a38dbf 100644 --- a/fs/btrfs/defrag.h +++ b/fs/btrfs/defrag.h @@ -6,14 +6,14 @@ #include <linux/types.h> #include <linux/compiler_types.h> -struct inode; struct file_ra_state; +struct btrfs_inode; struct btrfs_fs_info; struct btrfs_root; struct btrfs_trans_handle; struct btrfs_ioctl_defrag_range_args; -int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, +int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag); int __init btrfs_auto_defrag_init(void); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0b4933c6a889..3f1551d8a5c6 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1211,7 +1211,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_block_rsv *block_rsv; int ret; @@ -1238,7 +1238,6 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node); btrfs_release_delayed_node(delayed_node); - btrfs_free_path(path); trans->block_rsv = block_rsv; return ret; @@ -1817,53 +1816,53 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, static void fill_stack_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *inode_item, - struct inode *inode) + struct btrfs_inode *inode) { + struct inode *vfs_inode = &inode->vfs_inode; u64 flags; - btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode)); - btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode)); - btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size); - btrfs_set_stack_inode_mode(inode_item, inode->i_mode); - btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink); - btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode)); - btrfs_set_stack_inode_generation(inode_item, - BTRFS_I(inode)->generation); + btrfs_set_stack_inode_uid(inode_item, i_uid_read(vfs_inode)); + btrfs_set_stack_inode_gid(inode_item, i_gid_read(vfs_inode)); + btrfs_set_stack_inode_size(inode_item, inode->disk_i_size); + btrfs_set_stack_inode_mode(inode_item, vfs_inode->i_mode); + btrfs_set_stack_inode_nlink(inode_item, vfs_inode->i_nlink); + btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(vfs_inode)); + btrfs_set_stack_inode_generation(inode_item, inode->generation); btrfs_set_stack_inode_sequence(inode_item, - inode_peek_iversion(inode)); + inode_peek_iversion(vfs_inode)); 
btrfs_set_stack_inode_transid(inode_item, trans->transid); - btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); - flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, - BTRFS_I(inode)->ro_flags); + btrfs_set_stack_inode_rdev(inode_item, vfs_inode->i_rdev); + flags = btrfs_inode_combine_flags(inode->flags, inode->ro_flags); btrfs_set_stack_inode_flags(inode_item, flags); btrfs_set_stack_inode_block_group(inode_item, 0); btrfs_set_stack_timespec_sec(&inode_item->atime, - inode_get_atime_sec(inode)); + inode_get_atime_sec(vfs_inode)); btrfs_set_stack_timespec_nsec(&inode_item->atime, - inode_get_atime_nsec(inode)); + inode_get_atime_nsec(vfs_inode)); btrfs_set_stack_timespec_sec(&inode_item->mtime, - inode_get_mtime_sec(inode)); + inode_get_mtime_sec(vfs_inode)); btrfs_set_stack_timespec_nsec(&inode_item->mtime, - inode_get_mtime_nsec(inode)); + inode_get_mtime_nsec(vfs_inode)); btrfs_set_stack_timespec_sec(&inode_item->ctime, - inode_get_ctime_sec(inode)); + inode_get_ctime_sec(vfs_inode)); btrfs_set_stack_timespec_nsec(&inode_item->ctime, - inode_get_ctime_nsec(inode)); + inode_get_ctime_nsec(vfs_inode)); - btrfs_set_stack_timespec_sec(&inode_item->otime, BTRFS_I(inode)->i_otime_sec); - btrfs_set_stack_timespec_nsec(&inode_item->otime, BTRFS_I(inode)->i_otime_nsec); + btrfs_set_stack_timespec_sec(&inode_item->otime, inode->i_otime_sec); + btrfs_set_stack_timespec_nsec(&inode_item->otime, inode->i_otime_nsec); } -int btrfs_fill_inode(struct inode *inode, u32 *rdev) +int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev) { - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_delayed_node *delayed_node; struct btrfs_inode_item *inode_item; + struct inode *vfs_inode = &inode->vfs_inode; - delayed_node = btrfs_get_delayed_node(BTRFS_I(inode)); + delayed_node = btrfs_get_delayed_node(inode); if (!delayed_node) return -ENOENT; @@ -1876,39 +1875,38 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) inode_item = &delayed_node->inode_item; - i_uid_write(inode, btrfs_stack_inode_uid(inode_item)); - i_gid_write(inode, btrfs_stack_inode_gid(inode_item)); - btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item)); - btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, - round_up(i_size_read(inode), fs_info->sectorsize)); - inode->i_mode = btrfs_stack_inode_mode(inode_item); - set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); - inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); - BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); - BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item); - - inode_set_iversion_queried(inode, - btrfs_stack_inode_sequence(inode_item)); - inode->i_rdev = 0; + i_uid_write(vfs_inode, btrfs_stack_inode_uid(inode_item)); + i_gid_write(vfs_inode, btrfs_stack_inode_gid(inode_item)); + btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); + btrfs_inode_set_file_extent_range(inode, 0, + round_up(i_size_read(vfs_inode), fs_info->sectorsize)); + vfs_inode->i_mode = btrfs_stack_inode_mode(inode_item); + set_nlink(vfs_inode, btrfs_stack_inode_nlink(inode_item)); + inode_set_bytes(vfs_inode, btrfs_stack_inode_nbytes(inode_item)); + inode->generation = btrfs_stack_inode_generation(inode_item); + inode->last_trans = btrfs_stack_inode_transid(inode_item); + + inode_set_iversion_queried(vfs_inode, btrfs_stack_inode_sequence(inode_item)); + vfs_inode->i_rdev = 0; *rdev = btrfs_stack_inode_rdev(inode_item); 
btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item), - &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); + &inode->flags, &inode->ro_flags); - inode_set_atime(inode, btrfs_stack_timespec_sec(&inode_item->atime), + inode_set_atime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->atime), btrfs_stack_timespec_nsec(&inode_item->atime)); - inode_set_mtime(inode, btrfs_stack_timespec_sec(&inode_item->mtime), + inode_set_mtime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->mtime), btrfs_stack_timespec_nsec(&inode_item->mtime)); - inode_set_ctime(inode, btrfs_stack_timespec_sec(&inode_item->ctime), + inode_set_ctime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->ctime), btrfs_stack_timespec_nsec(&inode_item->ctime)); - BTRFS_I(inode)->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime); - BTRFS_I(inode)->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime); + inode->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime); + inode->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime); - inode->i_generation = BTRFS_I(inode)->generation; - if (S_ISDIR(inode->i_mode)) - BTRFS_I(inode)->index_cnt = (u64)-1; + vfs_inode->i_generation = inode->generation; + if (S_ISDIR(vfs_inode->i_mode)) + inode->index_cnt = (u64)-1; mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_node(delayed_node); @@ -1928,8 +1926,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, mutex_lock(&delayed_node->mutex); if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { - fill_stack_inode_item(trans, &delayed_node->inode_item, - &inode->vfs_inode); + fill_stack_inode_item(trans, &delayed_node->inode_item, inode); goto release_node; } @@ -1937,7 +1934,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, if (ret) goto release_node; - fill_stack_inode_item(trans, &delayed_node->inode_item, &inode->vfs_inode); + fill_stack_inode_item(trans, &delayed_node->inode_item, inode); set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); delayed_node->count++; atomic_inc(&root->fs_info->delayed_root->items); diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index f4d9feac0d0e..c4b4ba122beb 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -133,7 +133,7 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode); int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); -int btrfs_fill_inode(struct inode *inode, u32 *rdev); +int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev); int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode); /* Used for drop dead root */ diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index a35067cebb97..f5ae880308d3 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -14,6 +14,8 @@ #include <linux/spinlock.h> #include <linux/slab.h> #include <uapi/linux/btrfs_tree.h> +#include "fs.h" +#include "messages.h" struct btrfs_trans_handle; struct btrfs_fs_info; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index f86fbea0b3de..53d7d85cb4be 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -76,7 +76,7 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) struct extent_buffer *eb; int slot; int ret = 0; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); int item_size; struct btrfs_dev_replace_item *ptr; u64 src_devid; @@ -85,10 +85,8 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) return 0; path = btrfs_alloc_path(); - if (!path) { - ret = 
-ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; key.objectid = 0; key.type = BTRFS_DEV_REPLACE_KEY; @@ -103,10 +101,8 @@ no_valid_dev_replace_entry_found: if (btrfs_find_device(fs_info->fs_devices, &args)) { btrfs_err(fs_info, "found replace target device without a valid replace item"); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } - ret = 0; dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; dev_replace->cont_reading_from_srcdev_mode = @@ -123,7 +119,7 @@ no_valid_dev_replace_entry_found: dev_replace->tgtdev = NULL; dev_replace->is_valid = 0; dev_replace->item_needs_writeback = 0; - goto out; + return 0; } slot = path->slots[0]; eb = path->nodes[0]; @@ -226,8 +222,6 @@ no_valid_dev_replace_entry_found: break; } -out: - btrfs_free_path(path); return ret; } @@ -346,7 +340,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; int ret; struct btrfs_root *dev_root = fs_info->dev_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *eb; struct btrfs_dev_replace_item *ptr; @@ -365,16 +359,15 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) key.offset = 0; path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { btrfs_warn(fs_info, "error %d while searching for dev_replace item!", ret); - goto out; + return ret; } if (ret == 0 && @@ -395,7 +388,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) btrfs_warn(fs_info, "delete too small dev_replace item failed %d!", ret); - goto out; + return ret; } ret = 1; } @@ -408,7 +401,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) if (ret < 0) { btrfs_warn(fs_info, "insert dev_replace item failed %d!", ret); - goto out; + return ret; } } @@ -440,8 +433,6 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) dev_replace->cursor_right); dev_replace->item_needs_writeback = 0; up_write(&dev_replace->rwsem); -out: - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index ccf91de29f80..b29cc31a7c4a 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -236,7 +236,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, int data_size; struct extent_buffer *leaf; int slot; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) @@ -251,20 +251,17 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, if (IS_ERR(di)) { ret = PTR_ERR(di); /* Nothing found, we're safe */ - if (ret == -ENOENT) { - ret = 0; - goto out; - } + if (ret == -ENOENT) + return 0; if (ret < 0) - goto out; + return ret; } /* we found an item, look for our name in the item */ if (di) { /* our exact name was found */ - ret = -EEXIST; - goto out; + return -EEXIST; } /* See if there is room in the item to insert this name. */ @@ -273,14 +270,11 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, slot = path->slots[0]; if (data_size + btrfs_item_size(leaf, slot) + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) { - ret = -EOVERFLOW; - } else { - /* plenty of insertion room */ - ret = 0; + return -EOVERFLOW; } -out: - btrfs_free_path(path); - return ret; + + /* Plenty of insertion room. 
*/ + return 0; } /* diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h index 28d69970bc70..8462579a95f4 100644 --- a/fs/btrfs/dir-item.h +++ b/fs/btrfs/dir-item.h @@ -10,6 +10,7 @@ struct fscrypt_str; struct btrfs_fs_info; struct btrfs_key; struct btrfs_path; +struct btrfs_inode; struct btrfs_root; struct btrfs_trans_handle; diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index 8567af46e16f..a374ce7a1813 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -248,7 +248,8 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, len = min(len, em->len - (start - em->start)); block_start = extent_map_block_start(em) + (start - em->start); - if (can_nocow_extent(inode, start, &len, &file_extent, false) == 1) { + if (can_nocow_extent(BTRFS_I(inode), start, &len, &file_extent, + false) == 1) { bg = btrfs_inc_nocow_writers(fs_info, block_start); if (bg) can_nocow = true; @@ -855,6 +856,22 @@ relock: btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); goto buffered; } + /* + * We can't control the folios being passed in; applications can write + * to them while a direct IO write is in progress. This means the + * content might change after we calculated the data checksum. + * Therefore we can end up storing a checksum that doesn't match the + * persisted data. + * + * To be extra safe and avoid false data checksum mismatches, if the + * inode requires data checksum, just fall back to buffered IO. + * For buffered IO we have full control of the page cache and can ensure + * no one is modifying the content during writeback. + */ + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + goto buffered; + } /* * The iov_iter can be mapped to the same file range we are writing to. diff --git a/fs/btrfs/direct-io.h b/fs/btrfs/direct-io.h index 3dc3ea926afe..df5d45ee6de7 100644 --- a/fs/btrfs/direct-io.h +++ b/fs/btrfs/direct-io.h @@ -5,6 +5,8 @@ #include <linux/types.h> +struct kiocb; + int __init btrfs_init_dio(void); void __cold btrfs_destroy_dio(void); diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index e815d165cccc..d6eef4bd9e9d 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -167,13 +167,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, block_group->discard_eligible_time = 0; queued = !list_empty(&block_group->discard_list); list_del_init(&block_group->discard_list); - /* - * If the block group is currently running in the discard workfn, we - * don't want to deref it, since it's still being used by the workfn. - * The workfn will notice this case and deref the block group when it is - * finished.
- */ - if (queued && !running) + if (queued) btrfs_put_block_group(block_group); spin_unlock(&discard_ctl->lock); @@ -260,9 +254,10 @@ again: block_group->discard_cursor = block_group->start; block_group->discard_state = BTRFS_DISCARD_EXTENTS; } - discard_ctl->block_group = block_group; } if (block_group) { + btrfs_get_block_group(block_group); + discard_ctl->block_group = block_group; *discard_state = block_group->discard_state; *discard_index = block_group->discard_index; } @@ -493,9 +488,20 @@ static void btrfs_discard_workfn(struct work_struct *work) block_group = peek_discard_list(discard_ctl, &discard_state, &discard_index, now); - if (!block_group || !btrfs_run_discard_work(discard_ctl)) + if (!block_group) return; + if (!btrfs_run_discard_work(discard_ctl)) { + spin_lock(&discard_ctl->lock); + btrfs_put_block_group(block_group); + discard_ctl->block_group = NULL; + spin_unlock(&discard_ctl->lock); + return; + } if (now < block_group->discard_eligible_time) { + spin_lock(&discard_ctl->lock); + btrfs_put_block_group(block_group); + discard_ctl->block_group = NULL; + spin_unlock(&discard_ctl->lock); btrfs_discard_schedule_work(discard_ctl, false); return; } @@ -547,15 +553,7 @@ static void btrfs_discard_workfn(struct work_struct *work) spin_lock(&discard_ctl->lock); discard_ctl->prev_discard = trimmed; discard_ctl->prev_discard_time = now; - /* - * If the block group was removed from the discard list while it was - * running in this workfn, then we didn't deref it, since this function - * still owned that reference. But we set the discard_ctl->block_group - * back to NULL, so we can use that condition to know that now we need - * to deref the block_group. - */ - if (discard_ctl->block_group == NULL) - btrfs_put_block_group(block_group); + btrfs_put_block_group(block_group); discard_ctl->block_group = NULL; __btrfs_discard_schedule_work(discard_ctl, now, false); spin_unlock(&discard_ctl->lock); diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h index dddb0f9101ba..2c5e85394092 100644 --- a/fs/btrfs/discard.h +++ b/fs/btrfs/discard.h @@ -3,6 +3,7 @@ #ifndef BTRFS_DISCARD_H #define BTRFS_DISCARD_H +#include <linux/types.h> #include <linux/sizes.h> struct btrfs_fs_info; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f09db62e61a1..3dd555db3d32 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -182,13 +182,12 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) { struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios = num_extent_folios(eb); int ret = 0; if (sb_rdonly(fs_info->sb)) return -EROFS; - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; u64 start = max_t(u64, eb->start, folio_pos(folio)); u64 end = min_t(u64, eb->start + eb->len, @@ -284,8 +283,7 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) if (WARN_ON_ONCE(found_start != eb->start)) return BLK_STS_IOERR; - if (WARN_ON(!btrfs_folio_test_uptodate(fs_info, eb->folios[0], - eb->start, eb->len))) + if (WARN_ON(!btrfs_meta_folio_test_uptodate(eb->folios[0], eb))) return BLK_STS_IOERR; ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, @@ -1089,21 +1087,22 @@ struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, const struct btrfs_key *key) { struct btrfs_root *root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) return ERR_PTR(-ENOMEM); root = read_tree_root_path(tree_root, path, key); - 
btrfs_free_path(path); return root; } /* - * Initialize subvolume root in-memory structure + * Initialize subvolume root in-memory structure. * * @anon_dev: anonymous device to attach to the root, if zero, allocate new + * + * In case of failure the caller is responsible for calling btrfs_free_fs_root() */ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) { @@ -1127,7 +1126,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) if (!anon_dev) { ret = get_anon_bdev(&root->anon_dev); if (ret) - goto fail; + return ret; } else { root->anon_dev = anon_dev; } @@ -1137,7 +1136,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) ret = btrfs_init_root_free_objectid(root); if (ret) { mutex_unlock(&root->objectid_mutex); - goto fail; + return ret; } ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); @@ -1145,9 +1144,6 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) mutex_unlock(&root->objectid_mutex); return 0; -fail: - /* The caller is responsible to call btrfs_free_fs_root */ - return ret; } static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, @@ -1568,7 +1564,7 @@ static int transaction_kthread(void *arg) do { cannot_commit = false; - delay = msecs_to_jiffies(fs_info->commit_interval * 1000); + delay = secs_to_jiffies(fs_info->commit_interval); mutex_lock(&fs_info->transaction_kthread_mutex); spin_lock(&fs_info->trans_lock); @@ -1583,9 +1579,9 @@ static int transaction_kthread(void *arg) cur->state < TRANS_STATE_COMMIT_PREP && delta < fs_info->commit_interval) { spin_unlock(&fs_info->trans_lock); - delay -= msecs_to_jiffies((delta - 1) * 1000); + delay -= secs_to_jiffies(delta - 1); delay = min(delay, - msecs_to_jiffies(fs_info->commit_interval * 1000)); + secs_to_jiffies(fs_info->commit_interval)); goto sleep; } transid = cur->transid; @@ -2200,8 +2196,8 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, static int load_global_roots(struct btrfs_root *tree_root) { - struct btrfs_path *path; - int ret = 0; + BTRFS_PATH_AUTO_FREE(path); + int ret; path = btrfs_alloc_path(); if (!path) @@ -2210,18 +2206,17 @@ static int load_global_roots(struct btrfs_root *tree_root) ret = load_global_roots_objectid(tree_root, path, BTRFS_EXTENT_TREE_OBJECTID, "extent"); if (ret) - goto out; + return ret; ret = load_global_roots_objectid(tree_root, path, BTRFS_CSUM_TREE_OBJECTID, "csum"); if (ret) - goto out; + return ret; if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE)) - goto out; + return ret; ret = load_global_roots_objectid(tree_root, path, BTRFS_FREE_SPACE_TREE_OBJECTID, "free space"); -out: - btrfs_free_path(path); + return ret; } @@ -2447,21 +2442,27 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, * Check sectorsize and nodesize first, other check will need it. * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here. */ - if (!is_power_of_2(sectorsize) || sectorsize < 4096 || + if (!is_power_of_2(sectorsize) || sectorsize < BTRFS_MIN_BLOCKSIZE || sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize); ret = -EINVAL; } /* - * We only support at most two sectorsizes: 4K and PAGE_SIZE. + * We only support at most 3 sectorsizes: 4K, PAGE_SIZE, MIN_BLOCKSIZE. + * + * For 4K page sized systems with non-debug builds, all three match (4K). + * For 4K page sized systems with debug builds, there are two supported + * block sizes
(4K and 2K). * * We can support 16K sectorsize with 64K page size without problem, * but such sectorsize/pagesize combination doesn't make much sense. * 4K will be our future standard, PAGE_SIZE is supported from the very * beginning. */ - if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) { + if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && + sectorsize != PAGE_SIZE && + sectorsize != BTRFS_MIN_BLOCKSIZE)) { btrfs_err(fs_info, "sectorsize %llu not yet supported for page size %lu", sectorsize, PAGE_SIZE); @@ -2561,6 +2562,9 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, ret = -EINVAL; } + if (ret) + return ret; + ret = validate_sys_chunk_array(fs_info, sb); /* @@ -3390,7 +3394,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); - fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits); fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; fs_info->fs_devices->fs_info = fs_info; @@ -3416,11 +3419,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize); - if (sectorsize < PAGE_SIZE) - btrfs_warn(fs_info, - "read-write for sector size %u with page size %lu is experimental", - sectorsize, PAGE_SIZE); - ret = btrfs_init_workqueues(fs_info); if (ret) goto fail_sb_buffer; @@ -4326,6 +4324,14 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_cleanup_defrag_inodes(fs_info); /* + * Handle the error fs first, as it will flush and wait for all ordered + * extents. This will generate delayed iputs, thus we want to handle + * it first. + */ + if (unlikely(BTRFS_FS_ERROR(fs_info))) + btrfs_error_commit_super(fs_info); + + /* * Wait for any fixup workers to complete. * If we don't wait for them here and they are still running by the time * we call kthread_stop() against the cleaner kthread further below, we @@ -4346,6 +4352,31 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_flush_workqueue(fs_info->delalloc_workers); /* + * We can have ordered extents getting their last reference dropped from + * the fs_info->workers queue because for async writes for data bios we + * queue a work for that queue, at btrfs_wq_submit_bio(), that runs + * run_one_async_done() which calls btrfs_bio_end_io() in case the bio + * has an error, and that later function can do the final + * btrfs_put_ordered_extent() on the ordered extent attached to the bio, + * which adds a delayed iput for the inode. So we must flush the queue + * so that we don't have delayed iputs after committing the current + * transaction below and stopping the cleaner and transaction kthreads. + */ + btrfs_flush_workqueue(fs_info->workers); + + /* + * When finishing a compressed write bio we schedule a work queue item + * to finish an ordered extent - btrfs_finish_compressed_write_work() + * calls btrfs_finish_ordered_extent() which in turn does a call to + * btrfs_queue_ordered_fn(), and that queues the ordered extent + * completion either in the endio_write_workers work queue or in the + * fs_info->endio_freespace_worker work queue. We flush those queues + * below, so before we flush them we must flush this queue for the + * workers of compressed writes.
+ */ + flush_workqueue(fs_info->compressed_write_workers); + + /* * After we parked the cleaner kthread, ordered extents may have * completed and created new delayed iputs. If one of the async reclaim * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we @@ -4369,6 +4400,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) /* Ordered extents for free space inodes. */ btrfs_flush_workqueue(fs_info->endio_freespace_worker); btrfs_run_delayed_iputs(fs_info); + /* There should be no more workload to generate new delayed iputs. */ + set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state); cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); @@ -4403,9 +4436,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "commit super ret %d", ret); } - if (BTRFS_FS_ERROR(fs_info)) - btrfs_error_commit_super(fs_info); - kthread_stop(fs_info->transaction_kthread); kthread_stop(fs_info->cleaner_kthread); @@ -4528,10 +4558,6 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) /* cleanup FS via transaction */ btrfs_cleanup_transaction(fs_info); - mutex_lock(&fs_info->cleaner_mutex); - btrfs_run_delayed_iputs(fs_info); - mutex_unlock(&fs_info->cleaner_mutex); - down_write(&fs_info->cleanup_work_sem); up_write(&fs_info->cleanup_work_sem); } @@ -4902,7 +4928,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) int btrfs_init_root_free_objectid(struct btrfs_root *root) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; struct extent_buffer *l; struct btrfs_key search_key; @@ -4918,14 +4944,13 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) search_key.offset = (u64)-1; ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) - goto error; + return ret; if (ret == 0) { /* * Key with offset -1 found, there would have to exist a root * with such id, but this is out of valid range. 
*/ - ret = -EUCLEAN; - goto error; + return -EUCLEAN; } if (path->slots[0] > 0) { slot = path->slots[0] - 1; @@ -4936,10 +4961,8 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) } else { root->free_objectid = BTRFS_FIRST_FREE_OBJECTID; } - ret = 0; -error: - btrfs_free_path(path); - return ret; + + return 0; } int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index e2b22bea348a..7fc8a3200b40 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -75,7 +75,7 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, { struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root; - struct inode *inode; + struct btrfs_inode *inode; if (objectid < BTRFS_FIRST_FREE_OBJECTID) return ERR_PTR(-ESTALE); @@ -89,12 +89,12 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, if (IS_ERR(inode)) return ERR_CAST(inode); - if (generation != 0 && generation != inode->i_generation) { - iput(inode); + if (generation != 0 && generation != inode->vfs_inode.i_generation) { + iput(&inode->vfs_inode); return ERR_PTR(-ESTALE); } - return d_obtain_alias(inode); + return d_obtain_alias(&inode->vfs_inode); } static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, @@ -145,9 +145,10 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, struct dentry *btrfs_get_parent(struct dentry *child) { - struct inode *dir = d_inode(child); - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_inode *dir = BTRFS_I(d_inode(child)); + struct btrfs_inode *inode; + struct btrfs_root *root = dir->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_root_ref *ref; @@ -159,13 +160,13 @@ struct dentry *btrfs_get_parent(struct dentry *child) if (!path) return ERR_PTR(-ENOMEM); - if (btrfs_ino(BTRFS_I(dir)) == BTRFS_FIRST_FREE_OBJECTID) { + if (btrfs_ino(dir) == BTRFS_FIRST_FREE_OBJECTID) { key.objectid = btrfs_root_id(root); key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = (u64)-1; root = fs_info->tree_root; } else { - key.objectid = btrfs_ino(BTRFS_I(dir)); + key.objectid = btrfs_ino(dir); key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; } @@ -210,7 +211,11 @@ struct dentry *btrfs_get_parent(struct dentry *child) found_key.offset, 0); } - return d_obtain_alias(btrfs_iget(key.objectid, root)); + inode = btrfs_iget(key.objectid, root); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return d_obtain_alias(&inode->vfs_inode); fail: btrfs_free_path(path); return ERR_PTR(ret); @@ -219,11 +224,11 @@ fail: static int btrfs_get_name(struct dentry *parent, char *name, struct dentry *child) { - struct inode *inode = d_inode(child); - struct inode *dir = d_inode(parent); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_path *path; - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_inode *inode = BTRFS_I(d_inode(child)); + struct btrfs_inode *dir = BTRFS_I(d_inode(parent)); + struct btrfs_root *root = dir->root; + struct btrfs_fs_info *fs_info = root->fs_info; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_inode_ref *iref; struct btrfs_root_ref *rref; struct extent_buffer *leaf; @@ -233,37 +238,34 @@ static int btrfs_get_name(struct dentry *parent, char *name, int ret; u64 ino; - if (!S_ISDIR(dir->i_mode)) + if (!S_ISDIR(dir->vfs_inode.i_mode)) return -EINVAL; - ino = btrfs_ino(BTRFS_I(inode)); + ino = 
btrfs_ino(inode); path = btrfs_alloc_path(); if (!path) return -ENOMEM; if (ino == BTRFS_FIRST_FREE_OBJECTID) { - key.objectid = btrfs_root_id(BTRFS_I(inode)->root); + key.objectid = btrfs_root_id(inode->root); key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = (u64)-1; root = fs_info->tree_root; } else { key.objectid = ino; - key.offset = btrfs_ino(BTRFS_I(dir)); key.type = BTRFS_INODE_REF_KEY; + key.offset = btrfs_ino(dir); } ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { - btrfs_free_path(path); return ret; } else if (ret > 0) { - if (ino == BTRFS_FIRST_FREE_OBJECTID) { + if (ino == BTRFS_FIRST_FREE_OBJECTID) path->slots[0]--; - } else { - btrfs_free_path(path); + else return -ENOENT; - } } leaf = path->nodes[0]; @@ -280,7 +282,6 @@ static int btrfs_get_name(struct dentry *parent, char *name, } read_extent_buffer(leaf, name, name_ptr, name_len); - btrfs_free_path(path); /* * have to add the null termination to make sure that reconnect_path diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 6d08c100b01d..13de6af279e5 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -346,10 +346,10 @@ static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 return tree_search_for_insert(tree, offset, NULL, NULL); } -static void extent_io_tree_panic(const struct extent_io_tree *tree, - const struct extent_state *state, - const char *opname, - int err) +static void __cold extent_io_tree_panic(const struct extent_io_tree *tree, + const struct extent_state *state, + const char *opname, + int err) { btrfs_panic(extent_io_tree_to_fs_info(tree), err, "extent io tree error on %s state start %llu end %llu", diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3014a1a23efd..957230abd827 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -70,20 +70,17 @@ static int block_group_bits(struct btrfs_block_group *cache, u64 bits) int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) { struct btrfs_root *root = btrfs_extent_root(fs_info, start); - int ret; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = start; - key.offset = len; key.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - btrfs_free_path(path); - return ret; + key.offset = len; + return btrfs_search_slot(NULL, root, &key, path, 0, 0); } /* @@ -103,7 +100,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; u64 num_refs; u64 extent_flags; @@ -125,16 +122,16 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, search_again: key.objectid = bytenr; - key.offset = offset; if (metadata) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = offset; extent_root = btrfs_extent_root(fs_info, bytenr); ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) - goto out_free; + return ret; if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) { if (path->slots[0]) { @@ -159,7 +156,7 @@ search_again: "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); - goto out_free; + return ret; } ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); @@ -170,7 
+167,7 @@ search_again: "unexpected zero reference count for extent item (%llu %u %llu)", key.objectid, key.type, key.offset); btrfs_abort_transaction(trans, ret); - goto out_free; + return ret; } extent_flags = btrfs_extent_flags(leaf, ei); owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]); @@ -216,8 +213,7 @@ search_again: *flags = extent_flags; if (owning_root) *owning_root = owner; -out_free: - btrfs_free_path(path); + return ret; } @@ -1487,7 +1483,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_extent_item *item; struct btrfs_key key; @@ -1508,7 +1504,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, node->parent, node->ref_root, owner, offset, refs_to_add, extent_op); if ((ret < 0 && ret != -EAGAIN) || !ret) - goto out; + return ret; /* * Ok we had -EAGAIN which means we didn't have space to insert and @@ -1533,8 +1529,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (ret) btrfs_abort_transaction(trans, ret); -out: - btrfs_free_path(path); + return ret; } @@ -1631,7 +1626,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_extent_item *ei; struct extent_buffer *leaf; u32 item_size; @@ -1662,7 +1657,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, again: ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { - goto out; + return ret; } else if (ret > 0) { if (metadata) { if (path->slots[0] > 0) { @@ -1679,8 +1674,8 @@ again: metadata = 0; key.objectid = head->bytenr; - key.offset = head->num_bytes; key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = head->num_bytes; goto again; } } else { @@ -1688,7 +1683,7 @@ again: btrfs_err(fs_info, "missing extent item for extent %llu num_bytes %llu level %d", head->bytenr, head->num_bytes, head->level); - goto out; + return ret; } } @@ -1701,13 +1696,12 @@ again: "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); - goto out; + return ret; } ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); __run_delayed_extent_op(extent_op, leaf, ei); -out: - btrfs_free_path(path); + return ret; } @@ -2348,8 +2342,8 @@ static noinline int check_committed_ref(struct btrfs_inode *inode, int ret; key.objectid = bytenr; - key.offset = (u64)-1; key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) @@ -2874,7 +2868,15 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) block_group->length, &trimmed); + /* + * Not strictly necessary to lock, as the block_group should be + * read-only from btrfs_delete_unused_bgs(). 
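Many hunks in this patch reorder btrfs_key initializers so objectid, type and offset are assigned in that order. The motivation is readability: that is the order in which keys compare. A small self-contained comparator showing the ordering (uint64_t/uint8_t stand in for the kernel's __u64/u8 types):

	#include <stdint.h>

	struct key {
		uint64_t objectid;
		uint8_t type;
		uint64_t offset;
	};

	/* Keys sort by objectid, then type, then offset. */
	static int key_cmp(const struct key *a, const struct key *b)
	{
		if (a->objectid != b->objectid)
			return a->objectid < b->objectid ? -1 : 1;
		if (a->type != b->type)
			return a->type < b->type ? -1 : 1;
		if (a->offset != b->offset)
			return a->offset < b->offset ? -1 : 1;
		return 0;
	}

Writing the initializers in comparison order makes it easier to eyeball what a search key will land on.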
+ */ + ASSERT(block_group->ro); + spin_lock(&fs_info->unused_bgs_lock); list_del_init(&block_group->bg_list); + spin_unlock(&fs_info->unused_bgs_lock); + btrfs_unfreeze_block_group(block_group); btrfs_put_block_group(block_group); @@ -5465,7 +5467,7 @@ static int check_ref_exists(struct btrfs_trans_handle *trans, { struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_head *head; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_extent_inline_ref *iref; int ret; bool exists = false; @@ -5482,7 +5484,6 @@ again: * If we get 0 then we found our reference, return 1, else * return the error if it's not -ENOENT; */ - btrfs_free_path(path); return (ret < 0 ) ? ret : 1; } @@ -5517,7 +5518,6 @@ again: mutex_unlock(&head->mutex); out: spin_unlock(&delayed_refs->lock); - btrfs_free_path(path); return exists ? 1 : 0; } @@ -6285,7 +6285,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *parent) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct walk_control *wc; int level; int parent_level; @@ -6298,10 +6298,8 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return -ENOMEM; wc = kzalloc(sizeof(*wc), GFP_NOFS); - if (!wc) { - btrfs_free_path(path); + if (!wc) return -ENOMEM; - } btrfs_assert_tree_write_locked(parent); parent_level = btrfs_header_level(parent); @@ -6338,7 +6336,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, } kfree(wc); - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index cfa52264f678..0ed682d9ed7b 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -4,7 +4,6 @@ #define BTRFS_EXTENT_TREE_H #include <linux/types.h> -#include "misc.h" #include "block-group.h" #include "locking.h" diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a2cac9d0a1a9..197f5e51c474 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -425,14 +425,14 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); ASSERT(folio_pos(folio) <= start && - start + len <= folio_pos(folio) + PAGE_SIZE); + start + len <= folio_pos(folio) + folio_size(folio)); if (uptodate && btrfs_verify_folio(folio, start, len)) btrfs_folio_set_uptodate(fs_info, folio, start, len); else btrfs_folio_clear_uptodate(fs_info, folio, start, len); - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_is_subpage(fs_info, folio)) folio_unlock(folio); else btrfs_folio_end_lock(fs_info, folio, start, len); @@ -488,11 +488,11 @@ static void end_bbio_data_write(struct btrfs_bio *bbio) static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) { ASSERT(folio_test_locked(folio)); - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_is_subpage(fs_info, folio)) return; ASSERT(folio_test_private(folio)); - btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE); + btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), folio_size(folio)); } /* @@ -523,8 +523,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) u64 end; u32 len; - /* For now only order 0 folios are supported for data. 
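The assertions above switch from PAGE_SIZE to folio_size() because a folio may span several pages. A minimal model of the bound being checked, assuming only that folio_size() is the page size shifted by the folio order (the function names here are illustrative):

	#include <assert.h>
	#include <stdint.h>

	#define PAGE_SIZE 4096ULL

	static uint64_t folio_size_of(unsigned int order)
	{
		return PAGE_SIZE << order;
	}

	/* With large folios, "pos + PAGE_SIZE" would wrongly reject ranges that
	 * fall in the folio's tail pages; the bound must come from the folio. */
	static void assert_range_in_folio(uint64_t folio_pos, unsigned int order,
					  uint64_t start, uint32_t len)
	{
		assert(folio_pos <= start);
		assert(start + len <= folio_pos + folio_size_of(order));
	}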
*/ - ASSERT(folio_order(folio) == 0); btrfs_debug(fs_info, "%s: bi_sector=%llu, err=%d, mirror=%u", __func__, bio->bi_iter.bi_sector, bio->bi_status, @@ -552,7 +550,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) if (likely(uptodate)) { loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> folio_shift(folio); /* * Zero out the remaining part if this range straddles @@ -561,9 +558,11 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) * Here we should only zero the range inside the folio, * not touch anything else. * - * NOTE: i_size is exclusive while end is inclusive. + * NOTE: i_size is exclusive while end is inclusive and + * folio_contains() takes PAGE_SIZE units. */ - if (folio_index(folio) == end_index && i_size <= end) { + if (folio_contains(folio, i_size >> PAGE_SHIFT) && + i_size <= end) { u32 zero_start = max(offset_in_folio(folio, i_size), offset_in_folio(folio, start)); u32 zero_len = offset_in_folio(folio, end) + 1 - @@ -754,7 +753,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, { struct btrfs_inode *inode = folio_to_inode(folio); - ASSERT(pg_offset + size <= PAGE_SIZE); + ASSERT(pg_offset + size <= folio_size(folio)); ASSERT(bio_ctrl->end_io_func); if (bio_ctrl->bbio && @@ -837,7 +836,7 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb, if (folio->mapping) lockdep_assert_held(&folio->mapping->i_private_lock); - if (fs_info->nodesize >= PAGE_SIZE) { + if (!btrfs_meta_is_subpage(fs_info)) { if (!folio_test_private(folio)) folio_attach_private(folio, eb); else @@ -871,7 +870,7 @@ int set_folio_extent_mapped(struct folio *folio) fs_info = folio_to_fs_info(folio); - if (btrfs_is_subpage(fs_info, folio->mapping)) + if (btrfs_is_subpage(fs_info, folio)) return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA); folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE); @@ -888,8 +887,8 @@ void clear_folio_extent_mapped(struct folio *folio) return; fs_info = folio_to_fs_info(folio); - if (btrfs_is_subpage(fs_info, folio->mapping)) - return btrfs_detach_subpage(fs_info, folio); + if (btrfs_is_subpage(fs_info, folio)) + return btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA); folio_detach_private(folio); } @@ -899,7 +898,6 @@ static struct extent_map *get_extent_map(struct btrfs_inode *inode, u64 len, struct extent_map **em_cached) { struct extent_map *em; - struct extent_state *cached_state = NULL; ASSERT(em_cached); @@ -915,14 +913,12 @@ static struct extent_map *get_extent_map(struct btrfs_inode *inode, *em_cached = NULL; } - btrfs_lock_and_flush_ordered_range(inode, start, start + len - 1, &cached_state); em = btrfs_get_extent(inode, folio, start, len); if (!IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); *em_cached = em; } - unlock_extent(&inode->io_tree, start, start + len - 1, &cached_state); return em; } @@ -939,16 +935,12 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); u64 start = folio_pos(folio); - const u64 end = start + PAGE_SIZE - 1; - u64 cur = start; + const u64 end = start + folio_size(folio) - 1; u64 extent_offset; u64 last_byte = i_size_read(inode); - u64 block_start; struct extent_map *em; int ret = 0; - size_t pg_offset = 0; - size_t iosize; - size_t blocksize = fs_info->sectorsize; + const size_t blocksize = fs_info->sectorsize; ret = set_folio_extent_mapped(folio); if (ret < 0) { @@ -956,28 +948,32 @@ static int 
btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, return ret; } - if (folio->index == last_byte >> folio_shift(folio)) { + if (folio_contains(folio, last_byte >> PAGE_SHIFT)) { size_t zero_offset = offset_in_folio(folio, last_byte); - if (zero_offset) { - iosize = folio_size(folio) - zero_offset; - folio_zero_range(folio, zero_offset, iosize); - } + if (zero_offset) + folio_zero_range(folio, zero_offset, + folio_size(folio) - zero_offset); } bio_ctrl->end_io_func = end_bbio_data_read; begin_folio_read(fs_info, folio); - while (cur <= end) { + for (u64 cur = start; cur <= end; cur += blocksize) { enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; + unsigned long pg_offset = offset_in_folio(folio, cur); bool force_bio_submit = false; u64 disk_bytenr; + u64 block_start; ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) { - iosize = folio_size(folio) - pg_offset; - folio_zero_range(folio, pg_offset, iosize); - end_folio_read(folio, true, cur, iosize); + folio_zero_range(folio, pg_offset, end - cur + 1); + end_folio_read(folio, true, cur, end - cur + 1); break; } + if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { + end_folio_read(folio, true, cur, blocksize); + continue; + } em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached); if (IS_ERR(em)) { end_folio_read(folio, false, cur, end + 1 - cur); @@ -989,15 +985,15 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, compress_type = extent_map_compression(em); - iosize = min(extent_map_end(em) - cur, end - cur + 1); - iosize = ALIGN(iosize, blocksize); if (compress_type != BTRFS_COMPRESS_NONE) disk_bytenr = em->disk_bytenr; else disk_bytenr = extent_map_block_start(em) + extent_offset; - block_start = extent_map_block_start(em); + if (em->flags & EXTENT_FLAG_PREALLOC) block_start = EXTENT_MAP_HOLE; + else + block_start = extent_map_block_start(em); /* * If we have a file range that points to a compressed extent @@ -1046,18 +1042,13 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, /* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) { - folio_zero_range(folio, pg_offset, iosize); - - end_folio_read(folio, true, cur, iosize); - cur = cur + iosize; - pg_offset += iosize; + folio_zero_range(folio, pg_offset, blocksize); + end_folio_read(folio, true, cur, blocksize); continue; } /* the get_extent function already copied into the folio */ if (block_start == EXTENT_MAP_INLINE) { - end_folio_read(folio, true, cur, iosize); - cur = cur + iosize; - pg_offset += iosize; + end_folio_read(folio, true, cur, blocksize); continue; } @@ -1068,22 +1059,204 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (force_bio_submit) submit_one_bio(bio_ctrl); - submit_extent_folio(bio_ctrl, disk_bytenr, folio, iosize, + submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize, pg_offset); - cur = cur + iosize; - pg_offset += iosize; } - return 0; } +/* + * Check if we can skip waiting for the @ordered extent covering the block at @fileoff. + * + * @fileoff: Both input and output. + * Input as the file offset where the check should start. + * Output as where the next check should start, + * if the function returns true. + * + * Return true if we can skip to @fileoff. The caller needs to check the new + * @fileoff value to make sure it covers the full range, before skipping the + * full OE. + * + * Return false if we must wait for the ordered extent. + */
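The rewritten read loop above replaces the accumulated iosize/pg_offset pair with a fixed blocksize step, deriving the in-folio offset from cur on every iteration. Reduced to a skeleton (block_is_uptodate() is a hypothetical stand-in for btrfs_folio_test_uptodate(); this restates the control flow, not the full mapping logic):

	for (u64 cur = start; cur <= end; cur += blocksize) {
		unsigned long pg_offset = offset_in_folio(folio, cur);

		if (cur >= last_byte) {		/* past EOF: zero the tail once */
			folio_zero_range(folio, pg_offset, end - cur + 1);
			end_folio_read(folio, true, cur, end - cur + 1);
			break;
		}
		if (block_is_uptodate(folio, cur, blocksize)) {
			end_folio_read(folio, true, cur, blocksize);
			continue;		/* already valid, no bio needed */
		}
		/* ... map the block and submit blocksize bytes at pg_offset ... */
	}

One block per iteration is slightly more work than the old extent-sized steps, but it makes per-block uptodate skipping possible, which the new locking scheme below relies on.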
+static bool can_skip_one_ordered_range(struct btrfs_inode *inode, + struct btrfs_ordered_extent *ordered, + u64 *fileoff) +{ + const struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct folio *folio; + const u32 blocksize = fs_info->sectorsize; + u64 cur = *fileoff; + bool ret; + + folio = filemap_get_folio(inode->vfs_inode.i_mapping, cur >> PAGE_SHIFT); + + /* + * We should have locked the folio(s) for range [start, end], thus + * there must be a folio and it must be locked. + */ + ASSERT(!IS_ERR(folio)); + ASSERT(folio_test_locked(folio)); + + /* + * There are several cases for the folio and OE combination: + * + * 1) Folio has no private flag + * The OE has all its IO done but not yet finished, and folio got + * invalidated. + * + * Here we have to wait for the OE to finish, as it may contain the + * to-be-inserted data checksum. + * Without the data checksum inserted into the csum tree, read will + * just fail with missing csum. + */ + if (!folio_test_private(folio)) { + ret = false; + goto out; + } + + /* + * 2) The first block is DIRTY. + * + * This means the OE is created by some other folios whose file pos is + * before this one. And since we are holding the folio lock, the writeback + * of this folio cannot start. + * + * We must skip the whole OE, because it will never start until we + * finish our folio read and unlock the folio. + */ + if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) { + u64 range_len = min(folio_pos(folio) + folio_size(folio), + ordered->file_offset + ordered->num_bytes) - cur; + + ret = true; + /* + * At least inside the folio, all the remaining blocks should + * also be dirty. + */ + ASSERT(btrfs_folio_test_dirty(fs_info, folio, cur, range_len)); + *fileoff = ordered->file_offset + ordered->num_bytes; + goto out; + } + + /* + * 3) The first block is uptodate. + * + * At least the first block can be skipped, but we are still not fully + * sure. E.g. if the OE has some other folios in the range that cannot + * be skipped. + * So we return true and update @fileoff to the OE/folio boundary. + */ + if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { + u64 range_len = min(folio_pos(folio) + folio_size(folio), + ordered->file_offset + ordered->num_bytes) - cur; + + /* + * The whole range to the OE end or folio boundary should also + * be uptodate. + */ + ASSERT(btrfs_folio_test_uptodate(fs_info, folio, cur, range_len)); + ret = true; + *fileoff = cur + range_len; + goto out; + } + + /* + * 4) The first block is not uptodate. + * + * This means the folio is invalidated after the writeback was finished, + * but by some other operations (e.g. block aligned buffered write) the + * folio is inserted into the filemap. + * Very much the same as case 1). + */ + ret = false; +out: + folio_put(folio); + return ret; +} + +static bool can_skip_ordered_extent(struct btrfs_inode *inode, + struct btrfs_ordered_extent *ordered, + u64 start, u64 end) +{ + const u64 range_end = min(end, ordered->file_offset + ordered->num_bytes - 1); + u64 cur = max(start, ordered->file_offset); + + while (cur < range_end) { + bool can_skip; + + can_skip = can_skip_one_ordered_range(inode, ordered, &cur); + if (!can_skip) + return false; + } + return true; +} + +/* + * Locking helper to make sure we get a stable view of extent maps for the + * involved range. + * + * This is for folio read paths (read and readahead), thus the involved range + * should have all the folios locked.
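The four cases in can_skip_one_ordered_range() reduce to a small decision table over the state of the first block. A condensed model (the enum and values are illustrative; the real code derives the state from the folio's private/dirty/uptodate bits, this only restates the outcomes):

	#include <stdbool.h>
	#include <stdint.h>

	enum first_block { NO_PRIVATE, FB_DIRTY, FB_UPTODATE, FB_STALE };

	static bool can_skip(enum first_block state, uint64_t *fileoff,
			     uint64_t folio_end, uint64_t oe_end)
	{
		switch (state) {
		case FB_DIRTY:		/* case 2: our folio lock blocks writeback, skip whole OE */
			*fileoff = oe_end;
			return true;
		case FB_UPTODATE:	/* case 3: skip to the folio/OE boundary, caller rechecks */
			*fileoff = folio_end < oe_end ? folio_end : oe_end;
			return true;
		case NO_PRIVATE:	/* case 1: csum may not be inserted yet, must wait */
		case FB_STALE:		/* case 4: same as case 1 */
		default:
			return false;
		}
	}

The "return true but advance only to a boundary" case is why can_skip_ordered_extent() loops: one ordered extent can cover several folios in different states.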
+ */ +static void lock_extents_for_read(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state) +{ + u64 cur_pos; + + /* Caller must provide a valid @cached_state. */ + ASSERT(cached_state); + + /* The range must at least be page aligned, as all read paths are folio based. */ + ASSERT(IS_ALIGNED(start, PAGE_SIZE)); + ASSERT(IS_ALIGNED(end + 1, PAGE_SIZE)); + +again: + lock_extent(&inode->io_tree, start, end, cached_state); + cur_pos = start; + while (cur_pos < end) { + struct btrfs_ordered_extent *ordered; + + ordered = btrfs_lookup_ordered_range(inode, cur_pos, + end - cur_pos + 1); + /* + * No ordered extents in the range, and we hold the extent lock, + * no one can modify the extent maps in the range, we're safe to return. + */ + if (!ordered) + break; + + /* Check if we can skip waiting for the whole OE. */ + if (can_skip_ordered_extent(inode, ordered, start, end)) { + cur_pos = min(ordered->file_offset + ordered->num_bytes, + end + 1); + btrfs_put_ordered_extent(ordered); + continue; + } + + /* Now wait for the OE to finish. */ + unlock_extent(&inode->io_tree, start, end, cached_state); + btrfs_start_ordered_extent_nowriteback(ordered, start, end + 1 - start); + btrfs_put_ordered_extent(ordered); + /* We have unlocked the whole range, restart from the beginning. */ + goto again; + } +} + int btrfs_read_folio(struct file *file, struct folio *folio) { + struct btrfs_inode *inode = folio_to_inode(folio); + const u64 start = folio_pos(folio); + const u64 end = start + folio_size(folio) - 1; + struct extent_state *cached_state = NULL; struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; struct extent_map *em_cached = NULL; int ret; + lock_extents_for_read(inode, start, end, &cached_state); ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); + unlock_extent(&inode->io_tree, start, end, &cached_state); + free_extent_map(em_cached); /* @@ -1102,7 +1275,7 @@ static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bit unsigned int start_bit; unsigned int nbits; - ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE); + ASSERT(start >= folio_start && start + len <= folio_start + folio_size(folio)); start_bit = (start - folio_start) >> fs_info->sectorsize_bits; nbits = len >> fs_info->sectorsize_bits; ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits)); @@ -1115,12 +1288,12 @@ static bool find_next_delalloc_bitmap(struct folio *folio, { struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); const u64 folio_start = folio_pos(folio); - const unsigned int bitmap_size = fs_info->sectors_per_page; + const unsigned int bitmap_size = btrfs_blocks_per_folio(fs_info, folio); unsigned int start_bit; unsigned int first_zero; unsigned int first_set; - ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE); + ASSERT(start >= folio_start && start < folio_start + folio_size(folio)); start_bit = (start - folio_start) >> fs_info->sectorsize_bits; first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit); @@ -1154,9 +1327,10 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, { struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode); struct writeback_control *wbc = bio_ctrl->wbc; - const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping); + const bool is_subpage = btrfs_is_subpage(fs_info, folio); const u64 page_start = folio_pos(folio); const u64 page_end = page_start + folio_size(folio) - 1; + const unsigned int blocks_per_folio = 
btrfs_blocks_per_folio(fs_info, folio); unsigned long delalloc_bitmap = 0; /* * Save the last found delalloc end. As the delalloc end can go beyond @@ -1181,14 +1355,14 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, int bit; /* Save the dirty bitmap as our submission bitmap will be a subset of it. */ - if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) { - ASSERT(fs_info->sectors_per_page > 1); + if (btrfs_is_subpage(fs_info, folio)) { + ASSERT(blocks_per_folio > 1); btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap); } else { bio_ctrl->submit_bitmap = 1; } - for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) { u64 start = page_start + (bit << fs_info->sectorsize_bits); btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize); @@ -1261,7 +1435,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, btrfs_root_id(inode->root), btrfs_ino(inode), folio_pos(folio), - fs_info->sectors_per_page, + blocks_per_folio, &bio_ctrl->submit_bitmap, found_start, found_len, ret); } else { @@ -1306,7 +1480,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, unsigned int bitmap_size = min( (last_finished_delalloc_end - page_start) >> fs_info->sectorsize_bits, - fs_info->sectors_per_page); + blocks_per_folio); for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size) btrfs_mark_ordered_io_finished(inode, folio, @@ -1321,7 +1495,7 @@ out: delalloc_end = page_end; /* * delalloc_end is already one less than the total length, so - * we don't subtract one from PAGE_SIZE + * we don't subtract one from PAGE_SIZE. */ delalloc_to_write += DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE); @@ -1330,7 +1504,7 @@ out: * If all ranges are submitted asynchronously, we just need to account * for them here. 
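The repeated substitution of fs_info->sectors_per_page with btrfs_blocks_per_folio() follows from the same large-folio reasoning: the bitmap length is now a property of the folio, not of PAGE_SIZE. The arithmetic in isolation, with an assumed standalone signature:

	#include <stdint.h>

	/* e.g. a 64K folio with 4K blocks needs a 16-bit dirty/submit bitmap,
	 * while a 4K folio with 2K blocks needs only 2 bits. */
	static unsigned int blocks_per_folio(uint64_t folio_size,
					     unsigned int sectorsize_bits)
	{
		return folio_size >> sectorsize_bits;
	}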
*/ - if (bitmap_empty(&bio_ctrl->submit_bitmap, fs_info->sectors_per_page)) { + if (bitmap_empty(&bio_ctrl->submit_bitmap, blocks_per_folio)) { wbc->nr_to_write -= delalloc_to_write; return 1; } @@ -1431,6 +1605,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, bool submitted_io = false; bool error = false; const u64 folio_start = folio_pos(folio); + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); u64 cur; int bit; int ret = 0; @@ -1439,21 +1614,23 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, start + len <= folio_start + folio_size(folio)); ret = btrfs_writepage_cow_fixup(folio); - if (ret) { + if (ret == -EAGAIN) { /* Fixup worker will requeue */ folio_redirty_for_writepage(bio_ctrl->wbc, folio); folio_unlock(folio); return 1; } + if (ret < 0) + return ret; for (cur = start; cur < start + len; cur += fs_info->sectorsize) set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap, - fs_info->sectors_per_page); + blocks_per_folio); bio_ctrl->end_io_func = end_bbio_data_write; - for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) { cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits); if (cur >= i_size) { @@ -1527,6 +1704,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl size_t pg_offset; loff_t i_size = i_size_read(&inode->vfs_inode); unsigned long end_index = i_size >> PAGE_SHIFT; + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc); @@ -1548,6 +1726,30 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl * The proper bitmap can only be initialized until writepage_delalloc(). */ bio_ctrl->submit_bitmap = (unsigned long)-1; + + /* + * If the page is dirty but without private set, it's marked dirty + * without informing the fs. + * Nowadays that is a bug, since the introduction of + * pin_user_pages*(). + * + * So here we check if the page has private set to rule out such + * case. + * But we also have a long history of relying on the COW fixup, + * so here we only enable this check for experimental builds until + * we're sure it's safe. 
+ */ + if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) && + unlikely(!folio_test_private(folio))) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + btrfs_err_rl(fs_info, + "root %lld ino %llu folio %llu is marked dirty without notifying the fs", + inode->root->root_key.objectid, + btrfs_ino(inode), folio_pos(folio)); + ret = -EUCLEAN; + goto done; + } + ret = set_folio_extent_mapped(folio); if (ret < 0) goto done; @@ -1559,14 +1761,14 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl goto done; ret = extent_writepage_io(inode, folio, folio_pos(folio), - PAGE_SIZE, bio_ctrl, i_size); + folio_size(folio), bio_ctrl, i_size); if (ret == 1) return 0; if (ret < 0) btrfs_err_rl(fs_info, "failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d", btrfs_root_id(inode->root), btrfs_ino(inode), - folio_pos(folio), fs_info->sectors_per_page, + folio_pos(folio), blocks_per_folio, &bio_ctrl->submit_bitmap, ret); bio_ctrl->wbc->nr_to_write--; @@ -1722,20 +1924,13 @@ static struct extent_buffer *find_extent_buffer_nolock( static void end_bbio_meta_write(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; - struct btrfs_fs_info *fs_info = eb->fs_info; struct folio_iter fi; - u32 bio_offset = 0; if (bbio->bio.bi_status != BLK_STS_OK) set_btree_ioerr(eb); bio_for_each_folio_all(fi, &bbio->bio) { - u64 start = eb->start + bio_offset; - struct folio *folio = fi.folio; - u32 len = fi.length; - - btrfs_folio_clear_writeback(fs_info, folio, start, len); - bio_offset += len; + btrfs_meta_folio_clear_writeback(fi.folio, eb); } clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); @@ -1789,38 +1984,21 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, wbc_init_bio(wbc, &bbio->bio); bbio->inode = BTRFS_I(eb->fs_info->btree_inode); bbio->file_offset = eb->start; - if (fs_info->nodesize < PAGE_SIZE) { - struct folio *folio = eb->folios[0]; - bool ret; + for (int i = 0; i < num_extent_folios(eb); i++) { + struct folio *folio = eb->folios[i]; + u64 range_start = max_t(u64, eb->start, folio_pos(folio)); + u32 range_len = min_t(u64, folio_pos(folio) + folio_size(folio), + eb->start + eb->len) - range_start; folio_lock(folio); - btrfs_subpage_set_writeback(fs_info, folio, eb->start, eb->len); - if (btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, - eb->len)) { - folio_clear_dirty_for_io(folio); - wbc->nr_to_write--; - } - ret = bio_add_folio(&bbio->bio, folio, eb->len, - eb->start - folio_pos(folio)); - ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio, eb->len); - folio_unlock(folio); - } else { - int num_folios = num_extent_folios(eb); - - for (int i = 0; i < num_folios; i++) { - struct folio *folio = eb->folios[i]; - bool ret; - - folio_lock(folio); - folio_clear_dirty_for_io(folio); - folio_start_writeback(folio); - ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); - ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio, eb->folio_size); + btrfs_meta_folio_clear_dirty(folio, eb); + btrfs_meta_folio_set_writeback(folio, eb); + if (!folio_test_dirty(folio)) wbc->nr_to_write -= folio_nr_pages(folio); - folio_unlock(folio); - } + bio_add_folio_nofail(&bbio->bio, folio, range_len, + offset_in_folio(folio, range_start)); + wbc_account_cgroup_owner(wbc, folio, range_len); + folio_unlock(folio); } btrfs_submit_bbio(bbio, 0); } @@ -1846,9 +2024,10 @@ static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc) u64 folio_start = folio_pos(folio); int bit_start = 0; int sectors_per_node = fs_info->nodesize >> 
fs_info->sectorsize_bits; + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); /* Lock and write each dirty extent buffers in the range */ - while (bit_start < fs_info->sectors_per_page) { + while (bit_start < blocks_per_folio) { struct btrfs_subpage *subpage = folio_get_private(folio); struct extent_buffer *eb; unsigned long flags; @@ -1864,7 +2043,7 @@ static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc) break; } spin_lock_irqsave(&subpage->lock, flags); - if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * fs_info->sectors_per_page, + if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * blocks_per_folio, subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); spin_unlock(&folio->mapping->i_private_lock); @@ -1930,7 +2109,7 @@ static int submit_eb_page(struct folio *folio, struct btrfs_eb_write_context *ct if (!folio_test_private(folio)) return 0; - if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE) + if (btrfs_meta_is_subpage(folio_to_fs_info(folio))) return submit_eb_subpage(folio, wbc); spin_lock(&mapping->i_private_lock); @@ -2189,10 +2368,8 @@ retry: done_index = folio_next_index(folio); /* * At this point we hold neither the i_pages lock nor - * the page lock: the page may be truncated or - * invalidated (changing page->mapping to NULL), - * or even swizzled back from swapper_space to - * tmpfs file mapping + * the folio lock: the folio may be truncated or + * invalidated (changing folio->mapping to NULL). */ if (!folio_trylock(folio)) { submit_write_bio(bio_ctrl, 0); @@ -2230,7 +2407,7 @@ retry: * regular submission. */ if (wbc->sync_mode != WB_SYNC_NONE || - btrfs_is_subpage(inode_to_fs_info(inode), mapping)) { + btrfs_is_subpage(inode_to_fs_info(inode), folio)) { if (folio_test_writeback(folio)) submit_write_bio(bio_ctrl, 0); folio_wait_writeback(folio); @@ -2311,8 +2488,8 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); while (cur <= end) { - u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); - u32 cur_len = cur_end + 1 - cur; + u64 cur_end; + u32 cur_len; struct folio *folio; folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT); @@ -2322,13 +2499,18 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f * code is just in case, but shouldn't actually be run. 
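extent_write_locked_range() now computes cur_end only after looking up the folio, clamping the range to whatever size that folio actually has instead of assuming one page. The clamping itself is plain interval math; a self-contained sketch:

	#include <stdint.h>

	static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }

	/* Intersect the inclusive range [cur, end] with the folio containing cur. */
	static void clamp_to_folio(uint64_t folio_pos, uint64_t folio_size,
				   uint64_t cur, uint64_t end,
				   uint64_t *cur_end, uint32_t *cur_len)
	{
		*cur_end = min_u64(folio_pos + folio_size - 1, end);
		*cur_len = (uint32_t)(*cur_end + 1 - cur);
	}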
*/ if (IS_ERR(folio)) { + cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); + cur_len = cur_end + 1 - cur; btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, cur, cur_len, false); mapping_set_error(mapping, PTR_ERR(folio)); - cur = cur_end + 1; + cur = cur_end; continue; } + cur_end = min_t(u64, folio_pos(folio) + folio_size(folio) - 1, end); + cur_len = cur_end + 1 - cur; + ASSERT(folio_test_locked(folio)); if (pages_dirty && folio != locked_folio) ASSERT(folio_test_dirty(folio)); @@ -2380,12 +2562,20 @@ void btrfs_readahead(struct readahead_control *rac) { struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD }; struct folio *folio; + struct btrfs_inode *inode = BTRFS_I(rac->mapping->host); + const u64 start = readahead_pos(rac); + const u64 end = start + readahead_length(rac) - 1; + struct extent_state *cached_state = NULL; struct extent_map *em_cached = NULL; u64 prev_em_start = (u64)-1; + lock_extents_for_read(inode, start, end, &cached_state); + while ((folio = readahead_folio(rac)) != NULL) btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); + unlock_extent(&inode->io_tree, start, end, &cached_state); + if (em_cached) free_extent_map(em_cached); submit_one_bio(&bio_ctrl); @@ -2432,7 +2622,7 @@ static bool try_release_extent_state(struct extent_io_tree *tree, struct folio *folio) { u64 start = folio_pos(folio); - u64 end = start + PAGE_SIZE - 1; + u64 end = start + folio_size(folio) - 1; bool ret; if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) { @@ -2470,7 +2660,7 @@ static bool try_release_extent_state(struct extent_io_tree *tree, bool try_release_extent_mapping(struct folio *folio, gfp_t mask) { u64 start = folio_pos(folio); - u64 end = start + PAGE_SIZE - 1; + u64 end = start + folio_size(folio) - 1; struct btrfs_inode *inode = folio_to_inode(folio); struct extent_io_tree *io_tree = &inode->io_tree; @@ -2581,7 +2771,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo return; } - if (fs_info->nodesize >= PAGE_SIZE) { + if (!btrfs_meta_is_subpage(fs_info)) { /* * We do this since we'll remove the pages after we've * removed the eb from the radix tree, so we could race @@ -2607,7 +2797,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * attached to one dummy eb, no sharing. */ if (!mapped) { - btrfs_detach_subpage(fs_info, folio); + btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA); return; } @@ -2618,7 +2808,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * page range and no unfinished IO. 
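Throughout this patch, open-coded nodesize versus PAGE_SIZE comparisons become btrfs_meta_is_subpage() calls; judging by the conversions (nodesize >= PAGE_SIZE maps to the negated helper), the predicate is equivalent to the following sketch. The actual helper is defined elsewhere in the series, so treat this as an assumption:

	#include <stdbool.h>
	#include <stdint.h>

	/* Metadata is "subpage" when a tree block is smaller than one page,
	 * i.e. several extent buffers can share a single folio. */
	static inline bool meta_is_subpage(uint32_t nodesize, unsigned long page_size)
	{
		return nodesize < page_size;
	}

Centralizing the check also lets cloned/dummy extent buffers be handled in one place instead of special-casing them at every call site.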
*/ if (!folio_range_has_eb(folio)) - btrfs_detach_subpage(fs_info, folio); + btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA); spin_unlock(&folio->mapping->i_private_lock); } @@ -2651,15 +2841,14 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) kmem_cache_free(extent_buffer_cache, eb); } -static struct extent_buffer * -__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, - unsigned long len) +static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) { struct extent_buffer *eb = NULL; eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL); eb->start = start; - eb->len = len; + eb->len = fs_info->nodesize; eb->fs_info = fs_info; init_rwsem(&eb->lock); @@ -2668,7 +2857,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); - ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE); + ASSERT(eb->len <= BTRFS_MAX_METADATA_BLOCKSIZE); return eb; } @@ -2676,10 +2865,9 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) { struct extent_buffer *new; - int num_folios = num_extent_folios(src); int ret; - new = __alloc_extent_buffer(src->fs_info, src->start, src->len); + new = __alloc_extent_buffer(src->fs_info, src->start); if (new == NULL) return NULL; @@ -2696,7 +2884,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) return NULL; } - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(src); i++) { struct folio *folio = new->folios[i]; ret = attach_extent_buffer_folio(new, folio, NULL); @@ -2712,26 +2900,24 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) return new; } -struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len) +struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) { struct extent_buffer *eb; - int num_folios = 0; int ret; - eb = __alloc_extent_buffer(fs_info, start, len); + eb = __alloc_extent_buffer(fs_info, start); if (!eb) return NULL; ret = alloc_eb_folio_array(eb, false); if (ret) - goto err; + goto out; - num_folios = num_extent_folios(eb); - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL); if (ret < 0) - goto err; + goto out_detach; } set_extent_buffer_uptodate(eb); @@ -2739,23 +2925,19 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); return eb; -err: - for (int i = 0; i < num_folios; i++) { + +out_detach: + for (int i = 0; i < num_extent_folios(eb); i++) { if (eb->folios[i]) { detach_extent_buffer_folio(eb, eb->folios[i]); folio_put(eb->folios[i]); } } +out: kmem_cache_free(extent_buffer_cache, eb); return NULL; } -struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start) -{ - return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize); -} - static void check_buffer_tree_ref(struct extent_buffer *eb) { int refs; @@ -2794,11 +2976,9 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) static void mark_extent_buffer_accessed(struct extent_buffer *eb) { - int num_folios= num_extent_folios(eb); - check_buffer_tree_ref(eb); - for (int i = 0; i < num_folios; i++) + for (int i = 0; i < num_extent_folios(eb); i++) 
folio_mark_accessed(eb->folios[i]); } @@ -2831,10 +3011,10 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, return eb; } -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, u64 start) { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *eb, *exists = NULL; int ret; @@ -2870,8 +3050,11 @@ again: free_eb: btrfs_release_extent_buffer(eb); return exists; -} +#else + /* Stub to avoid linker error when compiled with optimizations turned off. */ + return NULL; #endif +} static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, struct folio *folio) @@ -2885,7 +3068,7 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, * don't try to insert two ebs for the same bytenr. So here we always * return NULL and just continue. */ - if (fs_info->nodesize < PAGE_SIZE) + if (btrfs_meta_is_subpage(fs_info)) return NULL; /* Page not yet attached to an extent buffer */ @@ -2988,7 +3171,7 @@ retry: finish: spin_lock(&mapping->i_private_lock); - if (existing_folio && fs_info->nodesize < PAGE_SIZE) { + if (existing_folio && btrfs_meta_is_subpage(fs_info)) { /* We're going to reuse the existing page, can drop our folio now. */ __free_page(folio_page(eb->folios[i], 0)); eb->folios[i] = existing_folio; @@ -3030,8 +3213,6 @@ finish: struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level) { - unsigned long len = fs_info->nodesize; - int num_folios; int attached = 0; struct extent_buffer *eb; struct extent_buffer *existing_eb = NULL; @@ -3059,7 +3240,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, if (eb) return eb; - eb = __alloc_extent_buffer(fs_info, start, len); + eb = __alloc_extent_buffer(fs_info, start); if (!eb) return ERR_PTR(-ENOMEM); @@ -3079,8 +3260,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * The memory will be freed by attach_extent_buffer_page() or freed * manually if we exit earlier. */ - if (fs_info->nodesize < PAGE_SIZE) { - prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); + if (btrfs_meta_is_subpage(fs_info)) { + prealloc = btrfs_alloc_subpage(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA); if (IS_ERR(prealloc)) { ret = PTR_ERR(prealloc); goto out; @@ -3095,9 +3276,8 @@ reallocate: goto out; } - num_folios = num_extent_folios(eb); /* Attach all pages to the filemap. */ - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio; ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb); @@ -3137,7 +3317,7 @@ reallocate: * and free the allocated page. */ folio = eb->folios[i]; - WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len)); + WARN_ON(btrfs_meta_folio_test_dirty(folio, eb)); /* * Check if the current page is physically contiguous with previous eb @@ -3148,7 +3328,7 @@ reallocate: if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0)) page_contig = false; - if (!btrfs_folio_test_uptodate(fs_info, folio, eb->start, eb->len)) + if (!btrfs_meta_folio_test_uptodate(folio, eb)) uptodate = 0; /* @@ -3191,7 +3371,7 @@ again: * btree_release_folio will correctly detect that a page belongs to a * live buffer and won't free them prematurely. 
*/ - for (int i = 0; i < num_folios; i++) + for (int i = 0; i < num_extent_folios(eb); i++) folio_unlock(eb->folios[i]); return eb; @@ -3222,7 +3402,7 @@ out: } /* * Now all pages of that extent buffer is unmapped, set UNMAPPED flag, - * so it can be cleaned up without utilizing page->mapping. + * so it can be cleaned up without utilizing folio->mapping. */ set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); @@ -3322,11 +3502,10 @@ void free_extent_buffer_stale(struct extent_buffer *eb) release_extent_buffer(eb); } -static void btree_clear_folio_dirty(struct folio *folio) +static void btree_clear_folio_dirty_tag(struct folio *folio) { - ASSERT(folio_test_dirty(folio)); + ASSERT(!folio_test_dirty(folio)); ASSERT(folio_test_locked(folio)); - folio_clear_dirty_for_io(folio); xa_lock_irq(&folio->mapping->i_pages); if (!folio_test_dirty(folio)) __xa_clear_mark(&folio->mapping->i_pages, @@ -3334,26 +3513,10 @@ static void btree_clear_folio_dirty(struct folio *folio) xa_unlock_irq(&folio->mapping->i_pages); } -static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) -{ - struct btrfs_fs_info *fs_info = eb->fs_info; - struct folio *folio = eb->folios[0]; - bool last; - - /* btree_clear_folio_dirty() needs page locked. */ - folio_lock(folio); - last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, eb->len); - if (last) - btree_clear_folio_dirty(folio); - folio_unlock(folio); - WARN_ON(atomic_read(&eb->refs) == 0); -} - void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios; btrfs_assert_tree_write_locked(eb); @@ -3380,17 +3543,16 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, fs_info->dirty_metadata_batch); - if (eb->fs_info->nodesize < PAGE_SIZE) - return clear_subpage_extent_buffer_dirty(eb); - - num_folios = num_extent_folios(eb); - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; + bool last; if (!folio_test_dirty(folio)) continue; folio_lock(folio); - btree_clear_folio_dirty(folio); + last = btrfs_meta_folio_clear_and_test_dirty(folio, eb); + if (last) + btree_clear_folio_dirty_tag(folio); folio_unlock(folio); } WARN_ON(atomic_read(&eb->refs) == 0); @@ -3398,37 +3560,34 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, void set_extent_buffer_dirty(struct extent_buffer *eb) { - int num_folios; bool was_dirty; check_buffer_tree_ref(eb); was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - num_folios = num_extent_folios(eb); WARN_ON(atomic_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)); if (!was_dirty) { - bool subpage = eb->fs_info->nodesize < PAGE_SIZE; + bool subpage = btrfs_meta_is_subpage(eb->fs_info); /* * For subpage case, we can have other extent buffers in the - * same page, and in clear_subpage_extent_buffer_dirty() we + * same page, and in clear_extent_buffer_dirty() we * have to clear page dirty without subpage lock held. * This can cause race where our page gets dirty cleared after * we just set it. * - * Thankfully, clear_subpage_extent_buffer_dirty() has locked + * Thankfully, clear_extent_buffer_dirty() has locked * its page for other reasons, we can use page lock to prevent * the above race. 
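set_extent_buffer_dirty() above tests and sets the dirty bit in a single step so that only the clean-to-dirty transition performs the dirty_metadata_bytes accounting. The idiom, modeled with C11 atomics (atomic_flag stands in for the eb's dirty bit; this is a model of the pattern, not the kernel code):

	#include <stdatomic.h>
	#include <stdbool.h>

	static atomic_flag eb_dirty = ATOMIC_FLAG_INIT;

	/* Concurrent callers may race to set the bit, but exactly one of them
	 * observes was_dirty == false and does the accounting. */
	static void set_dirty(long *dirty_bytes, long eb_len)
	{
		bool was_dirty = atomic_flag_test_and_set(&eb_dirty);

		if (!was_dirty)
			*dirty_bytes += eb_len;
	}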
*/ if (subpage) folio_lock(eb->folios[0]); - for (int i = 0; i < num_folios; i++) - btrfs_folio_set_dirty(eb->fs_info, eb->folios[i], - eb->start, eb->len); + for (int i = 0; i < num_extent_folios(eb); i++) + btrfs_meta_folio_set_dirty(eb->folios[i], eb); if (subpage) folio_unlock(eb->folios[0]); percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, @@ -3436,54 +3595,31 @@ void set_extent_buffer_dirty(struct extent_buffer *eb) eb->fs_info->dirty_metadata_batch); } #ifdef CONFIG_BTRFS_DEBUG - for (int i = 0; i < num_folios; i++) + for (int i = 0; i < num_extent_folios(eb); i++) ASSERT(folio_test_dirty(eb->folios[i])); #endif } void clear_extent_buffer_uptodate(struct extent_buffer *eb) { - struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios = num_extent_folios(eb); clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; if (!folio) continue; - /* - * This is special handling for metadata subpage, as regular - * btrfs_is_subpage() can not handle cloned/dummy metadata. - */ - if (fs_info->nodesize >= PAGE_SIZE) - folio_clear_uptodate(folio); - else - btrfs_subpage_clear_uptodate(fs_info, folio, - eb->start, eb->len); + btrfs_meta_folio_clear_uptodate(folio, eb); } } void set_extent_buffer_uptodate(struct extent_buffer *eb) { - struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios = num_extent_folios(eb); set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - for (int i = 0; i < num_folios; i++) { - struct folio *folio = eb->folios[i]; - - /* - * This is special handling for metadata subpage, as regular - * btrfs_is_subpage() can not handle cloned/dummy metadata. - */ - if (fs_info->nodesize >= PAGE_SIZE) - folio_mark_uptodate(folio); - else - btrfs_subpage_set_uptodate(fs_info, folio, - eb->start, eb->len); - } + for (int i = 0; i < num_extent_folios(eb); i++) + btrfs_meta_folio_set_uptodate(eb->folios[i], eb); } static void clear_extent_buffer_reading(struct extent_buffer *eb) @@ -3496,10 +3632,7 @@ static void clear_extent_buffer_reading(struct extent_buffer *eb) static void end_bbio_meta_read(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; - struct btrfs_fs_info *fs_info = eb->fs_info; bool uptodate = !bbio->bio.bi_status; - struct folio_iter fi; - u32 bio_offset = 0; /* * If the extent buffer is marked UPTODATE before the read operation @@ -3521,19 +3654,6 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio) set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); } - bio_for_each_folio_all(fi, &bbio->bio) { - struct folio *folio = fi.folio; - u64 start = eb->start + bio_offset; - u32 len = fi.length; - - if (uptodate) - btrfs_folio_set_uptodate(fs_info, folio, start, len); - else - btrfs_folio_clear_uptodate(fs_info, folio, start, len); - - bio_offset += len; - } - clear_extent_buffer_reading(eb); free_extent_buffer(eb); @@ -3544,7 +3664,6 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, const struct btrfs_tree_parent_check *check) { struct btrfs_bio *bbio; - bool ret; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; @@ -3584,19 +3703,14 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, bbio->inode = BTRFS_I(eb->fs_info->btree_inode); bbio->file_offset = eb->start; memcpy(&bbio->parent_check, check, sizeof(*check)); - if (eb->fs_info->nodesize < PAGE_SIZE) { - ret = bio_add_folio(&bbio->bio, eb->folios[0], eb->len, - eb->start - folio_pos(eb->folios[0])); - 
ASSERT(ret); - } else { - int num_folios = num_extent_folios(eb); - - for (int i = 0; i < num_folios; i++) { - struct folio *folio = eb->folios[i]; + for (int i = 0; i < num_extent_folios(eb); i++) { + struct folio *folio = eb->folios[i]; + u64 range_start = max_t(u64, eb->start, folio_pos(folio)); + u32 range_len = min_t(u64, folio_pos(folio) + folio_size(folio), + eb->start + eb->len) - range_start; - ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); - ASSERT(ret); - } + bio_add_folio_nofail(&bbio->bio, folio, range_len, + offset_in_folio(folio, range_start)); } btrfs_submit_bbio(bbio, mirror_num); return 0; @@ -3785,7 +3899,7 @@ static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i) if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) return; - if (fs_info->nodesize < PAGE_SIZE) { + if (btrfs_meta_is_subpage(fs_info)) { folio = eb->folios[0]; ASSERT(i == 0); if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio, @@ -4271,7 +4385,7 @@ int try_release_extent_buffer(struct folio *folio) { struct extent_buffer *eb; - if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE) + if (btrfs_meta_is_subpage(folio_to_fs_info(folio))) return try_release_subpage_extent_buffer(folio); /* diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6c5328bfabc2..2e261892c7bc 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -252,8 +252,6 @@ void clear_folio_extent_mapped(struct folio *folio); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level); -struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len); struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src); @@ -276,7 +274,8 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, u64 gen, int level); void btrfs_readahead_node_child(struct extent_buffer *node, int slot); -static inline int num_extent_pages(const struct extent_buffer *eb) +/* Note: this can be used in for loops without caching the value in a variable. */ +static inline int __pure num_extent_pages(const struct extent_buffer *eb) { /* * For sectorsize == PAGE_SIZE case, since nodesize is always aligned to @@ -294,8 +293,10 @@ static inline int num_extent_pages(const struct extent_buffer *eb) * As we can have either one large folio covering the whole eb * (either nodesize <= PAGE_SIZE, or high order folio), or multiple * single-paged folios. + * + * Note: this can be used in for loops without caching the value in a variable. 
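The new notes on num_extent_pages()/num_extent_folios() hinge on the pure attribute: a pure function's result depends only on its arguments and readable state, so the compiler may hoist the call out of a loop condition instead of re-evaluating it each iteration. A small illustration with made-up names:

	struct eb_model { int nr_folios; };

	static int __attribute__((pure)) folio_count(const struct eb_model *eb)
	{
		return eb->nr_folios;
	}

	static void walk(const struct eb_model *eb)
	{
		/* Safe and cheap: the bound may be computed once by the compiler. */
		for (int i = 0; i < folio_count(eb); i++)
			;	/* process folio i */
	}

That is why the patch can drop the "int num_folios = num_extent_folios(eb);" locals and call the helper directly in the loop headers.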
*/ -static inline int num_extent_folios(const struct extent_buffer *eb) +static inline int __pure num_extent_folios(const struct extent_buffer *eb) { if (folio_order(eb->folios[0])) return 1; diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 67ce85ff0ae2..7f46abbd6311 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1128,6 +1128,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c long nr_dropped = 0; struct rb_node *node; + lockdep_assert_held_write(&tree->lock); + /* * Take the mmap lock so that we serialize with the inode logging phase * of fsync because we may need to set the full sync flag on the inode, @@ -1139,28 +1141,12 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c * to find new extents, which may not be there yet because ordered * extents haven't completed yet. * - * We also do a try lock because otherwise we could deadlock. This is - * because the shrinker for this filesystem may be invoked while we are - * in a path that is holding the mmap lock in write mode. For example in - * a reflink operation while COWing an extent buffer, when allocating - * pages for a new extent buffer and under memory pressure, the shrinker - * may be invoked, and therefore we would deadlock by attempting to read - * lock the mmap lock while we are holding already a write lock on it. + * We also do a try lock because we don't want to block for too long and + * we are holding the extent map tree's lock in write mode. */ if (!down_read_trylock(&inode->i_mmap_lock)) return 0; - /* - * We want to be fast so if the lock is busy we don't want to spend time - * waiting for it - either some task is about to do IO for the inode or - * we may have another task shrinking extent maps, here in this code, so - * skip this inode. - */ - if (!write_trylock(&tree->lock)) { - up_read(&inode->i_mmap_lock); - return 0; - } - node = rb_first(&tree->root); while (node) { struct rb_node *next = rb_next(node); @@ -1201,12 +1187,61 @@ next: break; node = next; } - write_unlock(&tree->lock); up_read(&inode->i_mmap_lock); return nr_dropped; } +static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root, + u64 min_ino) +{ + struct btrfs_inode *inode; + unsigned long from = min_ino; + + xa_lock(&root->inodes); + while (true) { + struct extent_map_tree *tree; + + inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT); + if (!inode) + break; + + tree = &inode->extent_tree; + + /* + * We want to be fast so if the lock is busy we don't want to + * spend time waiting for it (some task is about to do IO for + * the inode). + */ + if (!write_trylock(&tree->lock)) + goto next; + + /* + * Skip inode if it doesn't have loaded extent maps, so we avoid + * getting a reference and doing an iput later. This includes + * cases like files that were opened for things like stat(2), or + * files with all extent maps previously released through the + * release folio callback (btrfs_release_folio()) or released in + * a previous run, or directories which never have extent maps. 
+ */ + if (RB_EMPTY_ROOT(&tree->root)) { + write_unlock(&tree->lock); + goto next; + } + + if (igrab(&inode->vfs_inode)) + break; + + write_unlock(&tree->lock); +next: + from = btrfs_ino(inode) + 1; + cond_resched_lock(&root->inodes.xa_lock); + } + xa_unlock(&root->inodes); + + return inode; +} + static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -1214,21 +1249,21 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx long nr_dropped = 0; u64 min_ino = fs_info->em_shrinker_last_ino + 1; - inode = btrfs_find_first_inode(root, min_ino); + inode = find_first_inode_to_shrink(root, min_ino); while (inode) { nr_dropped += btrfs_scan_inode(inode, ctx); + write_unlock(&inode->extent_tree.lock); min_ino = btrfs_ino(inode) + 1; fs_info->em_shrinker_last_ino = btrfs_ino(inode); - btrfs_add_delayed_iput(inode); + iput(&inode->vfs_inode); - if (ctx->scanned >= ctx->nr_to_scan || - btrfs_fs_closing(inode->root->fs_info)) + if (ctx->scanned >= ctx->nr_to_scan || btrfs_fs_closing(fs_info)) break; cond_resched(); - inode = btrfs_find_first_inode(root, min_ino); + inode = find_first_inode_to_shrink(root, min_ino); } if (inode) { diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index d04a3b47b1fb..344b4db487a0 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -163,20 +163,21 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, int ret = 0; struct btrfs_file_extent_item *item; struct btrfs_key file_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; path = btrfs_alloc_path(); if (!path) return -ENOMEM; + file_key.objectid = objectid; - file_key.offset = pos; file_key.type = BTRFS_EXTENT_DATA_KEY; + file_key.offset = pos; ret = btrfs_insert_empty_item(trans, root, path, &file_key, sizeof(*item)); if (ret < 0) - goto out; + return ret; leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -190,8 +191,7 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_compression(leaf, item, 0); btrfs_set_file_extent_encryption(leaf, item, 0); btrfs_set_file_extent_other_encoding(leaf, item, 0); -out: - btrfs_free_path(path); + return ret; } @@ -212,8 +212,8 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans, int csums_in_item; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - file_key.offset = bytenr; file_key.type = BTRFS_EXTENT_CSUM_KEY; + file_key.offset = bytenr; ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow); if (ret < 0) goto fail; @@ -259,8 +259,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, int cow = mod != 0; file_key.objectid = objectid; - file_key.offset = offset; file_key.type = BTRFS_EXTENT_DATA_KEY; + file_key.offset = offset; return btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); } @@ -341,7 +341,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio = &bbio->bio; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); const u32 sectorsize = fs_info->sectorsize; const u32 csum_size = fs_info->csum_size; u32 orig_len = bio->bi_iter.bi_size; @@ -373,10 +373,8 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); - if (!bbio->csum) { - 
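/*
 * A small userspace model of the scan pattern introduced in
 * find_first_inode_to_shrink() above: candidates are visited in id order,
 * each one is claimed only via a trylock so a busy inode is skipped instead
 * of stalling the shrinker, and empty trees are rejected before taking a
 * reference. The array and pthread rwlock below are stand-ins for the
 * xarray and the extent map tree lock, not the kernel types.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct model_inode {
	unsigned long ino;
	bool has_extent_maps;
	pthread_rwlock_t tree_lock;
};

/* Return the first claimable inode with ino >= *from, with its lock held. */
static struct model_inode *first_to_shrink(struct model_inode *tab, int n,
					   unsigned long *from)
{
	for (int i = 0; i < n; i++) {
		struct model_inode *in = &tab[i];

		if (in->ino < *from)
			continue;
		*from = in->ino + 1;
		if (pthread_rwlock_trywrlock(&in->tree_lock) != 0)
			continue;	/* busy: skip rather than block */
		if (!in->has_extent_maps) {
			pthread_rwlock_unlock(&in->tree_lock);
			continue;	/* nothing to drop, skip the iget/iput cost */
		}
		return in;		/* caller scans the tree, then unlocks */
	}
	return NULL;
}

int main(void)
{
	struct model_inode tab[3] = {
		{ 256, false, PTHREAD_RWLOCK_INITIALIZER },
		{ 257, true,  PTHREAD_RWLOCK_INITIALIZER },
		{ 258, true,  PTHREAD_RWLOCK_INITIALIZER },
	};
	unsigned long from = 0;
	struct model_inode *in = first_to_shrink(tab, 3, &from);

	printf("picked ino %lu\n", in ? in->ino : 0);	/* picks 257 */
	if (in)
		pthread_rwlock_unlock(&in->tree_lock);
	return 0;
}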
btrfs_free_path(path); + if (!bbio->csum) return BLK_STS_RESOURCE; - } } else { bbio->csum = bbio->csum_inline; } @@ -444,7 +442,6 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) bio_offset += count * sectorsize; } - btrfs_free_path(path); return ret; } @@ -484,8 +481,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, path->nowait = nowait; key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key.offset = start; key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = start; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -874,7 +871,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; u64 end_byte = bytenr + len; u64 csum_end; @@ -892,8 +889,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, while (1) { key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key.offset = end_byte - 1; key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = end_byte - 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { @@ -1010,7 +1007,6 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, } btrfs_release_path(path); } - btrfs_free_path(path); return ret; } @@ -1074,8 +1070,8 @@ again: found_next = 0; bytenr = sums->logical + total_bytes; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - file_key.offset = bytenr; file_key.type = BTRFS_EXTENT_CSUM_KEY; + file_key.offset = bytenr; item = btrfs_lookup_csum(trans, root, path, bytenr, 1); if (!IS_ERR(item)) { diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 0e13661a71f3..6181a70ec3ef 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -3,8 +3,10 @@ #ifndef BTRFS_FILE_ITEM_H #define BTRFS_FILE_ITEM_H +#include <linux/blk_types.h> #include <linux/list.h> #include <uapi/linux/btrfs_tree.h> +#include "ctree.h" #include "accessors.h" struct extent_map; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 36f51c311bb1..262a707d8990 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -804,14 +804,15 @@ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 { u64 clamp_start = max_t(u64, pos, folio_pos(folio)); u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio)); + const u32 blocksize = inode_to_fs_info(inode)->sectorsize; int ret = 0; if (folio_test_uptodate(folio)) return 0; if (!force_uptodate && - IS_ALIGNED(clamp_start, PAGE_SIZE) && - IS_ALIGNED(clamp_end, PAGE_SIZE)) + IS_ALIGNED(clamp_start, blocksize) && + IS_ALIGNED(clamp_end, blocksize)) return 0; ret = btrfs_read_folio(NULL, folio); @@ -874,7 +875,6 @@ again: ret = PTR_ERR(folio); return ret; } - folio_wait_writeback(folio); /* Only support page sized folio yet. 
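/*
 * Why the key initializations above are reordered to (objectid, type,
 * offset): btrfs keys sort by exactly those members, in that order, so
 * writing the initializers the same way keeps the code visually aligned
 * with the comparison semantics. A minimal userspace sketch; the struct
 * and comparator are simplified stand-ins for the kernel's btrfs_key and
 * its key comparison helpers, with type values taken from btrfs_tree.h.
 */
#include <stdint.h>
#include <stdio.h>

struct model_key {
	uint64_t objectid;
	uint8_t type;
	uint64_t offset;
};

static int model_comp_keys(const struct model_key *a, const struct model_key *b)
{
	if (a->objectid != b->objectid)
		return a->objectid < b->objectid ? -1 : 1;
	if (a->type != b->type)
		return a->type < b->type ? -1 : 1;
	if (a->offset != b->offset)
		return a->offset < b->offset ? -1 : 1;
	return 0;
}

int main(void)
{
	/* EXTENT_DATA (108) at a huge offset vs. EXTENT_CSUM (128) at 0: */
	struct model_key a = { .objectid = 256, .type = 108, .offset = ~0ULL };
	struct model_key b = { .objectid = 256, .type = 128, .offset = 0 };

	/* type is compared before offset, so a still sorts first: prints -1. */
	printf("%d\n", model_comp_keys(&a, &b));
	return 0;
}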
*/ ASSERT(folio_order(folio) == 0); ret = set_folio_extent_mapped(folio); @@ -1014,8 +1014,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, &cached_state); } - ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, - NULL, nowait); + ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, nowait); if (ret <= 0) btrfs_drew_write_unlock(&root->snapshot_lock); else @@ -1039,7 +1038,6 @@ int btrfs_write_check(struct kiocb *iocb, size_t count) loff_t pos = iocb->ki_pos; int ret; loff_t oldsize; - loff_t start_pos; /* * Quickly bail out on NOWAIT writes if we don't have the nodatacow or @@ -1066,9 +1064,8 @@ int btrfs_write_check(struct kiocb *iocb, size_t count) inode_inc_iversion(inode); } - start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); - if (start_pos > oldsize) { + if (pos > oldsize) { /* Expand hole size to cover write data, preventing empty gap */ loff_t end_pos = round_up(pos + count, fs_info->sectorsize); @@ -1092,7 +1089,7 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) u64 lockend; size_t num_written = 0; ssize_t ret; - loff_t old_isize = i_size_read(inode); + loff_t old_isize; unsigned int ilock_flags = 0; const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0); @@ -1105,6 +1102,13 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) if (ret < 0) return ret; + /* + * We can only trust the isize with inode lock held, or it can race with + * other buffered writes and cause incorrect call of + * pagecache_isize_extended() to overwrite existing data. + */ + old_isize = i_size_read(inode); + ret = generic_write_checks(iocb, i); if (ret <= 0) goto out; @@ -1778,6 +1782,7 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) struct extent_changeset *data_reserved = NULL; unsigned long zero_start; loff_t size; + size_t fsize = folio_size(folio); vm_fault_t ret; int ret2; int reserved = 0; @@ -1788,7 +1793,7 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) ASSERT(folio_order(folio) == 0); - reserved_space = PAGE_SIZE; + reserved_space = fsize; sb_start_pagefault(inode->i_sb); page_start = folio_pos(folio); @@ -1842,7 +1847,7 @@ again: * We can't set the delalloc bits if there are pending ordered * extents. Drop our locks and wait for them to finish. 
*/ - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE); + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, fsize); if (ordered) { unlock_extent(io_tree, page_start, page_end, &cached_state); folio_unlock(folio); @@ -1854,11 +1859,11 @@ again: if (folio->index == ((size - 1) >> PAGE_SHIFT)) { reserved_space = round_up(size - page_start, fs_info->sectorsize); - if (reserved_space < PAGE_SIZE) { + if (reserved_space < fsize) { end = page_start + reserved_space - 1; btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, - PAGE_SIZE - reserved_space, true); + fsize - reserved_space, true); } } @@ -1885,12 +1890,12 @@ again: if (page_start + folio_size(folio) > size) zero_start = offset_in_folio(folio, size); else - zero_start = PAGE_SIZE; + zero_start = fsize; - if (zero_start != PAGE_SIZE) + if (zero_start != fsize) folio_zero_range(folio, zero_start, folio_size(folio) - zero_start); - btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); + btrfs_folio_clear_checked(fs_info, folio, page_start, fsize); btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start); @@ -1899,7 +1904,7 @@ again: unlock_extent(io_tree, page_start, page_end, &cached_state); up_read(&BTRFS_I(inode)->i_mmap_lock); - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_extents(BTRFS_I(inode), fsize); sb_end_pagefault(inode->i_sb); extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; @@ -1908,7 +1913,7 @@ out_unlock: folio_unlock(folio); up_read(&BTRFS_I(inode)->i_mmap_lock); out: - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_extents(BTRFS_I(inode), fsize); btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, reserved_space, (ret != 0)); out_noreserve: diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h index de89e644be29..d7df81388cbe 100644 --- a/fs/btrfs/file.h +++ b/fs/btrfs/file.h @@ -9,6 +9,8 @@ struct file; struct extent_state; struct kiocb; struct iov_iter; +struct inode; +struct folio; struct page; struct btrfs_ioctl_encoded_io_args; struct btrfs_drop_extents_args; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d42b6f882f57..05e173311c1a 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -88,13 +88,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, struct btrfs_disk_key disk_key; struct btrfs_free_space_header *header; struct extent_buffer *leaf; - struct inode *inode = NULL; + struct btrfs_inode *inode; unsigned nofs_flag; int ret; key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -120,13 +120,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, btrfs_release_path(path); memalloc_nofs_restore(nofs_flag); if (IS_ERR(inode)) - return inode; + return ERR_CAST(inode); - mapping_set_gfp_mask(inode->i_mapping, - mapping_gfp_constraint(inode->i_mapping, + mapping_set_gfp_mask(inode->vfs_inode.i_mapping, + mapping_gfp_constraint(inode->vfs_inode.i_mapping, ~(__GFP_FS | __GFP_HIGHMEM))); - return inode; + return &inode->vfs_inode; } struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, @@ -201,8 +201,8 @@ static int __create_free_space_inode(struct btrfs_root *root, btrfs_release_path(path); key.objectid = 
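/*
 * Worked example of the EOF clamping in btrfs_page_mkwrite() above: when
 * the faulted folio straddles i_size, only [page_start, round_up(size,
 * sectorsize)) keeps its delalloc reservation and the tail is released.
 * Standalone arithmetic with assumed values (a 16K folio and 4K sectors,
 * with i_size landing 5K into the folio).
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t round_up_u64(uint64_t x, uint64_t a)
{
	return (x + a - 1) & ~(a - 1);
}

int main(void)
{
	const uint64_t sectorsize = 4096;
	const uint64_t fsize = 16384;		/* folio_size(folio) */
	const uint64_t page_start = 65536;	/* folio_pos(folio) */
	const uint64_t size = page_start + 5120; /* i_size inside the folio */

	uint64_t reserved_space = fsize;

	if (page_start + fsize > size) {
		reserved_space = round_up_u64(size - page_start, sectorsize);
		/* keep 8192, release 8192 */
		printf("keep %llu bytes, release %llu bytes\n",
		       (unsigned long long)reserved_space,
		       (unsigned long long)(fsize - reserved_space));
	}
	return 0;
}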
BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(struct btrfs_free_space_header)); if (ret < 0) { @@ -244,7 +244,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_block_group *block_group) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int ret = 0; @@ -257,12 +257,12 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) { if (PTR_ERR(inode) != -ENOENT) ret = PTR_ERR(inode); - goto out; + return ret; } ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) { btrfs_add_delayed_iput(BTRFS_I(inode)); - goto out; + return ret; } clear_nlink(inode); /* One for the block groups ref */ @@ -285,12 +285,9 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, if (ret) { if (ret > 0) ret = 0; - goto out; + return ret; } - ret = btrfs_del_item(trans, trans->fs_info->tree_root, path); -out: - btrfs_free_path(path); - return ret; + return btrfs_del_item(trans, trans->fs_info->tree_root, path); } int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, @@ -447,7 +444,7 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) { - struct page *page; + struct folio *folio; struct inode *inode = io_ctl->inode; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int i; @@ -455,31 +452,33 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) for (i = 0; i < io_ctl->num_pages; i++) { int ret; - page = find_or_create_page(inode->i_mapping, i, mask); - if (!page) { + folio = __filemap_get_folio(inode->i_mapping, i, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mask); + if (IS_ERR(folio)) { io_ctl_drop_pages(io_ctl); return -ENOMEM; } - ret = set_folio_extent_mapped(page_folio(page)); + ret = set_folio_extent_mapped(folio); if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); io_ctl_drop_pages(io_ctl); return ret; } - io_ctl->pages[i] = page; - if (uptodate && !PageUptodate(page)) { - btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (page->mapping != inode->i_mapping) { + io_ctl->pages[i] = &folio->page; + if (uptodate && !folio_test_uptodate(folio)) { + btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (folio->mapping != inode->i_mapping) { btrfs_err(BTRFS_I(inode)->root->fs_info, "free space cache page truncated"); io_ctl_drop_pages(io_ctl); return -EIO; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { btrfs_err(BTRFS_I(inode)->root->fs_info, "error reading free space cache"); io_ctl_drop_pages(io_ctl); @@ -753,8 +752,8 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, return 0; key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -1156,8 +1155,8 @@ update_cache_item(struct btrfs_trans_handle *trans, int ret; key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index cae540ec15ed..39c6b96a4c25 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1062,7 +1062,8 @@ static int populate_free_space_tree(struct 
btrfs_trans_handle *trans, struct btrfs_block_group *block_group) { struct btrfs_root *extent_root; - struct btrfs_path *path, *path2; + BTRFS_PATH_AUTO_FREE(path); + BTRFS_PATH_AUTO_FREE(path2); struct btrfs_key key; u64 start, end; int ret; @@ -1070,17 +1071,16 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = READA_FORWARD; path2 = btrfs_alloc_path(); - if (!path2) { - btrfs_free_path(path); + if (!path2) return -ENOMEM; - } + + path->reada = READA_FORWARD; ret = add_new_free_space_info(trans, block_group, path2); if (ret) - goto out; + return ret; mutex_lock(&block_group->free_space_lock); @@ -1146,9 +1146,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, ret = 0; out_locked: mutex_unlock(&block_group->free_space_lock); -out: - btrfs_free_path(path2); - btrfs_free_path(path); + return ret; } @@ -1217,7 +1215,7 @@ out_clear: static int clear_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int nr; int ret; @@ -1233,7 +1231,7 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans, while (1) { ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) - goto out; + return ret; nr = btrfs_header_nritems(path->nodes[0]); if (!nr) @@ -1242,15 +1240,12 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans, path->slots[0] = 0; ret = btrfs_del_items(trans, root, path, 0, nr); if (ret) - goto out; + return ret; btrfs_release_path(path); } - ret = 0; -out: - btrfs_free_path(path); - return ret; + return 0; } int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) @@ -1638,9 +1633,8 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group *block_group; struct btrfs_free_space_info *info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); u32 extent_count, flags; - int ret; block_group = caching_ctl->block_group; @@ -1657,10 +1651,9 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl) path->reada = READA_FORWARD; info = search_free_space_info(NULL, block_group, path, 0); - if (IS_ERR(info)) { - ret = PTR_ERR(info); - goto out; - } + if (IS_ERR(info)) + return PTR_ERR(info); + extent_count = btrfs_free_space_extent_count(path->nodes[0], info); flags = btrfs_free_space_flags(path->nodes[0], info); @@ -1670,11 +1663,7 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl) * there. */ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) - ret = load_free_space_bitmaps(caching_ctl, path, extent_count); + return load_free_space_bitmaps(caching_ctl, path, extent_count); else - ret = load_free_space_extents(caching_ctl, path, extent_count); - -out: - btrfs_free_path(path); - return ret; + return load_free_space_extents(caching_ctl, path, extent_count); } diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index 09cfb43580cb..b2bb86f8d7cf 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "messages.h" -#include "ctree.h" #include "fs.h" #include "accessors.h" #include "volumes.h" diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index b572d6b9730b..bcca43046064 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -47,6 +47,18 @@ struct btrfs_subpage_info; struct btrfs_stripe_hash_table; struct btrfs_space_info; +/* + * Minimum data and metadata block size. 
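/*
 * The BTRFS_PATH_AUTO_FREE conversions above lean on the compiler's scoped
 * cleanup attribute: a variable declared with __attribute__((cleanup(fn)))
 * has fn() invoked on every scope exit, which is why the error paths can
 * now simply 'return ret' without an 'out:' label. A minimal userspace
 * sketch of the idiom (GCC/Clang); model_path and the macro name here are
 * stand-ins, not the kernel definitions.
 */
#include <errno.h>
#include <stdlib.h>

struct model_path { int slots; };

static void model_free_path(struct model_path **p)
{
	free(*p);	/* freeing NULL is a no-op, like btrfs_free_path(NULL) */
}

#define MODEL_PATH_AUTO_FREE(name) \
	struct model_path *name __attribute__((cleanup(model_free_path))) = NULL

static int use_path(int fail_early)
{
	MODEL_PATH_AUTO_FREE(path);

	path = calloc(1, sizeof(*path));
	if (!path)
		return -ENOMEM;
	if (fail_early)
		return -EIO;	/* path is freed automatically here... */
	return 0;		/* ...and here, on the success path too */
}

int main(void)
{
	return use_path(0);
}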
+ *
+ * Normally it's 4K, but for testing subpage block size on 4K page systems, we
+ * allow DEBUG builds to accept a 2K block size.
+ */
+#ifdef CONFIG_BTRFS_DEBUG
+#define BTRFS_MIN_BLOCKSIZE	(SZ_2K)
+#else
+#define BTRFS_MIN_BLOCKSIZE	(SZ_4K)
+#endif
+
 #define BTRFS_MAX_EXTENT_SIZE SZ_128M

 #define BTRFS_OLDEST_GENERATION	0ULL

@@ -105,6 +117,9 @@ enum {
 	/* Indicates there was an error cleaning up a log tree. */
 	BTRFS_FS_STATE_LOG_CLEANUP_ERROR,

+	/* No more delayed iput can be queued. */
+	BTRFS_FS_STATE_NO_DELAYED_IPUT,
+
 	BTRFS_FS_STATE_COUNT
 };

@@ -485,8 +500,8 @@ struct btrfs_fs_info {
 	u64 last_trans_log_full_commit;
 	unsigned long long mount_opt;

-	unsigned long compress_type:4;
-	unsigned int compress_level;
+	int compress_type;
+	int compress_level;
 	u32 commit_interval;
 	/*
 	 * It is a suggestive number, the read side is safe even it gets a
@@ -709,7 +724,6 @@ struct btrfs_fs_info {
 	 * running.
 	 */
 	refcount_t scrub_workers_refcnt;
-	u32 sectors_per_page;
 	struct workqueue_struct *scrub_workers;

 	struct btrfs_discard_ctl discard_ctl;
@@ -981,6 +995,12 @@ static inline u32 count_max_extents(const struct btrfs_fs_info *fs_info, u64 siz
 	return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size);
 }

+static inline unsigned int btrfs_blocks_per_folio(const struct btrfs_fs_info *fs_info,
+						  const struct folio *folio)
+{
+	return folio_size(folio) >> fs_info->sectorsize_bits;
+}
+
 bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
 			enum btrfs_exclusive_operation type);
 bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 448aa1a682d6..3530de0618c8 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -191,8 +191,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 	int del_len = name->len + sizeof(*ref);

 	key.objectid = inode_objectid;
-	key.offset = ref_objectid;
 	key.type = BTRFS_INODE_REF_KEY;
+	key.offset = ref_objectid;

 	path = btrfs_alloc_path();
 	if (!path)
@@ -317,8 +317,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 	int ins_len = name->len + sizeof(*ref);

 	key.objectid = inode_objectid;
-	key.offset = ref_objectid;
 	key.type = BTRFS_INODE_REF_KEY;
+	key.offset = ref_objectid;

 	path = btrfs_alloc_path();
 	if (!path)
@@ -493,8 +493,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	path->reada = READA_BACK;

 	key.objectid = control->ino;
-	key.offset = (u64)-1;
 	key.type = (u8)-1;
+	key.offset = (u64)-1;

search_again:
 	/*
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a9322601ab5c..cc67d1a2d611 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -489,8 +489,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
 	size_t datasize;

 	key.objectid = btrfs_ino(inode);
-	key.offset = 0;
 	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = 0;

 	datasize = btrfs_file_extent_calc_inline_size(cur_size);
 	ret = btrfs_insert_empty_item(trans, root, path, &key,
@@ -566,23 +566,14 @@ static bool can_cow_file_range_inline(struct btrfs_inode *inode,
 	if (offset != 0)
 		return false;

-	/*
-	 * Due to the page size limit, for subpage we can only trigger the
-	 * writeback for the dirty sectors of page, that means data writeback
-	 * is doing more writeback than what we want.
-	 *
-	 * This is especially unexpected for some call sites like fallocate,
-	 * where we only increase i_size after everything is done.
-	 * This means we can trigger inline extent even if we didn't want to.
-	 * So here we skip inline extent creation completely.
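/*
 * A quick numeric model of btrfs_blocks_per_folio() above: a folio of
 * folio_size bytes holds folio_size >> sectorsize_bits blocks. The sizes
 * below (the 2K debug minimum and the 4K default block size, against 4K
 * and 16K folios) are assumed for illustration.
 */
#include <stdio.h>

static unsigned int blocks_per_folio(unsigned long folio_size,
				     unsigned int sectorsize_bits)
{
	return folio_size >> sectorsize_bits;
}

int main(void)
{
	printf("%u\n", blocks_per_folio(4096, 11));	/* 2K blocks: 2 */
	printf("%u\n", blocks_per_folio(4096, 12));	/* 4K blocks: 1 */
	printf("%u\n", blocks_per_folio(16384, 12));	/* 4K blocks: 4 */
	return 0;
}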
- */ - if (fs_info->sectorsize != PAGE_SIZE) - return false; - /* Inline extents are limited to sectorsize. */ if (size > fs_info->sectorsize) return false; + /* We do not allow a non-compressed extent to be as large as block size. */ + if (data_len >= fs_info->sectorsize) + return false; + /* We cannot exceed the maximum inline data size. */ if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) return false; @@ -672,7 +663,7 @@ out: * And at reserve time, it's always aligned to page size, so * just free one page here. */ - btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL); + btrfs_qgroup_free_data(inode, NULL, 0, fs_info->sectorsize, NULL); btrfs_free_path(path); btrfs_end_transaction(trans); return ret; @@ -832,7 +823,7 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, btrfs_add_inode_defrag(inode, small_write); } -static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, u64 end) { unsigned long end_index = end >> PAGE_SHIFT; struct folio *folio; @@ -840,13 +831,13 @@ static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 e for (unsigned long index = start >> PAGE_SHIFT; index <= end_index; index++) { - folio = filemap_get_folio(inode->i_mapping, index); + folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); if (IS_ERR(folio)) { if (!ret) ret = PTR_ERR(folio); continue; } - btrfs_folio_clamp_clear_dirty(inode_to_fs_info(inode), folio, start, + btrfs_folio_clamp_clear_dirty(inode->root->fs_info, folio, start, end + 1 - start); folio_put(folio); } @@ -886,6 +877,7 @@ static void compress_file_range(struct btrfs_work *work) unsigned int poff; int i; int compress_type = fs_info->compress_type; + int compress_level = fs_info->compress_level; inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); @@ -894,7 +886,7 @@ static void compress_file_range(struct btrfs_work *work) * Otherwise applications with the file mmap'd can wander in and change * the page contents while we are compressing them. */ - ret = extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); + ret = extent_range_clear_dirty_for_io(inode, start, end); /* * All the folios should have been locked thus no failure. @@ -968,13 +960,15 @@ again: goto cleanup_and_bail_uncompressed; } - if (inode->defrag_compress) + if (inode->defrag_compress) { compress_type = inode->defrag_compress; - else if (inode->prop_compress) + compress_level = inode->defrag_compress_level; + } else if (inode->prop_compress) { compress_type = inode->prop_compress; + } /* Compression level is applied here. */ - ret = btrfs_compress_folios(compress_type | (fs_info->compress_level << 4), + ret = btrfs_compress_folios(compress_type, compress_level, mapping, start, folios, &nr_folios, &total_in, &total_compressed); if (ret) @@ -1090,7 +1084,6 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, &wbc, false); wbc_detach_inode(&wbc); if (ret < 0) { - btrfs_cleanup_ordered_extents(inode, start, end - start + 1); if (locked_folio) btrfs_folio_end_lock(inode->root->fs_info, locked_folio, start, async_extent->ram_size); @@ -1272,10 +1265,7 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * - Else all pages except for @locked_folio are unlocked. * * When a failure happens in the second or later iteration of the - * while-loop, the ordered extents created in previous iterations are kept - * intact. 
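/*
 * The inline-extent gating in can_cow_file_range_inline() above, restated
 * as one standalone predicate: inline creation is only considered for data
 * at file offset 0 that fits within a single block, whose (possibly
 * compressed) length stays strictly below the block size and within the
 * leaf's inline limit. A sketch with simplified parameters; max_inline
 * stands in for BTRFS_MAX_INLINE_DATA_SIZE(fs_info).
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool model_can_inline(uint64_t offset, uint64_t size, uint64_t data_len,
			     uint32_t sectorsize, uint32_t max_inline)
{
	if (offset != 0)
		return false;		/* inline data must start the file */
	if (size > sectorsize)
		return false;		/* inline extents are limited to one block */
	if (data_len >= sectorsize)
		return false;		/* must be strictly smaller than a block */
	if (data_len > max_inline)
		return false;		/* must fit in a btree leaf item */
	return true;
}

int main(void)
{
	printf("%d\n", model_can_inline(0, 1500, 1500, 4096, 2048)); /* 1 */
	printf("%d\n", model_can_inline(0, 4096, 4096, 4096, 2048)); /* 0 */
	return 0;
}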
So, the caller must clean them up by calling
- * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for
- * example.
+ * while-loop, the ordered extents created in previous iterations are cleaned up.
  */
 static noinline int cow_file_range(struct btrfs_inode *inode,
 				   struct folio *locked_folio, u64 start,
@@ -1382,8 +1372,13 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
 			continue;
 		}
 		if (done_offset) {
-			*done_offset = start - 1;
-			return 0;
+			/*
+			 * Move @end to the end of the processed range,
+			 * and exit the loop to unlock the processed extents.
+			 */
+			end = start - 1;
+			ret = 0;
+			break;
 		}
 		ret = -ENOSPC;
 	}
@@ -1487,11 +1482,9 @@ out_unlock:
 	/*
 	 * For the range (1). We have already instantiated the ordered extents
-	 * for this region. They are cleaned up by
-	 * btrfs_cleanup_ordered_extents() in e.g,
-	 * btrfs_run_delalloc_range().
+	 * for this region, thus we need to clean up those ordered extents.
 	 * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV
-	 * are also handled by the cleanup function.
+	 * are also handled by the ordered extents cleanup.
 	 *
 	 * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and
 	 * finish the writeback of the involved folios, which will never be submitted.
@@ -1502,6 +1495,8 @@ out_unlock:
 		if (!locked_folio)
 			mapping_set_error(inode->vfs_inode.i_mapping, ret);
+
+		btrfs_cleanup_ordered_extents(inode, orig_start, start - orig_start);
 		extent_clear_unlock_delalloc(inode, orig_start, start - 1,
 					     locked_folio, NULL, clear_bits, page_ops);
 	}
@@ -1971,6 +1966,65 @@ static void cleanup_dirty_folios(struct btrfs_inode *inode,
 	mapping_set_error(mapping, error);
 }

+static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio,
+			   struct extent_state **cached,
+			   struct can_nocow_file_extent_args *nocow_args,
+			   u64 file_pos, bool is_prealloc)
+{
+	struct btrfs_ordered_extent *ordered;
+	u64 len = nocow_args->file_extent.num_bytes;
+	u64 end = file_pos + len - 1;
+	int ret = 0;
+
+	lock_extent(&inode->io_tree, file_pos, end, cached);
+
+	if (is_prealloc) {
+		struct extent_map *em;
+
+		em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent,
+					BTRFS_ORDERED_PREALLOC);
+		if (IS_ERR(em)) {
+			unlock_extent(&inode->io_tree, file_pos, end, cached);
+			return PTR_ERR(em);
+		}
+		free_extent_map(em);
+	}
+
+	ordered = btrfs_alloc_ordered_extent(inode, file_pos, &nocow_args->file_extent,
+					     is_prealloc
+					     ? (1 << BTRFS_ORDERED_PREALLOC)
+					     : (1 << BTRFS_ORDERED_NOCOW));
+	if (IS_ERR(ordered)) {
+		if (is_prealloc)
+			btrfs_drop_extent_map_range(inode, file_pos, end, false);
+		unlock_extent(&inode->io_tree, file_pos, end, cached);
+		return PTR_ERR(ordered);
+	}
+
+	if (btrfs_is_data_reloc_root(inode->root))
+		/*
+		 * Errors are handled later, as we must prevent
+		 * extent_clear_unlock_delalloc() in error handler from freeing
+		 * metadata of the created ordered extent.
+		 */
+		ret = btrfs_reloc_clone_csums(ordered);
+	btrfs_put_ordered_extent(ordered);
+
+	extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
+				     EXTENT_LOCKED | EXTENT_DELALLOC |
+				     EXTENT_CLEAR_DATA_RESV,
+				     PAGE_UNLOCK | PAGE_SET_ORDERED);
+	/*
+	 * On error, we need to clean up the ordered extents we created.
+	 *
+	 * We do not clear the folio Dirty flags because they are set and
+	 * cleared by the caller.
+	 */
+	if (ret < 0)
+		btrfs_cleanup_ordered_extents(inode, file_pos, end);
+	return ret;
+}
+
 /*
  * when nocow writeback calls back.
This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. @@ -2015,15 +2069,12 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, while (cur_offset <= end) { struct btrfs_block_group *nocow_bg = NULL; - struct btrfs_ordered_extent *ordered; struct btrfs_key found_key; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; struct extent_state *cached_state = NULL; u64 extent_end; - u64 nocow_end; int extent_type; - bool is_prealloc; ret = btrfs_lookup_file_extent(NULL, root, path, ino, cur_offset, 0); @@ -2149,75 +2200,21 @@ must_cow: if (cow_start != (u64)-1) { ret = fallback_to_cow(inode, locked_folio, cow_start, found_key.offset - 1); - cow_start = (u64)-1; if (ret) { cow_end = found_key.offset - 1; btrfs_dec_nocow_writers(nocow_bg); goto error; } + cow_start = (u64)-1; } - nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1; - lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state); - - is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC; - if (is_prealloc) { - struct extent_map *em; - - em = btrfs_create_io_em(inode, cur_offset, - &nocow_args.file_extent, - BTRFS_ORDERED_PREALLOC); - if (IS_ERR(em)) { - unlock_extent(&inode->io_tree, cur_offset, - nocow_end, &cached_state); - btrfs_dec_nocow_writers(nocow_bg); - ret = PTR_ERR(em); - goto error; - } - free_extent_map(em); - } - - ordered = btrfs_alloc_ordered_extent(inode, cur_offset, - &nocow_args.file_extent, - is_prealloc - ? (1 << BTRFS_ORDERED_PREALLOC) - : (1 << BTRFS_ORDERED_NOCOW)); + ret = nocow_one_range(inode, locked_folio, &cached_state, + &nocow_args, cur_offset, + extent_type == BTRFS_FILE_EXTENT_PREALLOC); btrfs_dec_nocow_writers(nocow_bg); - if (IS_ERR(ordered)) { - if (is_prealloc) { - btrfs_drop_extent_map_range(inode, cur_offset, - nocow_end, false); - } - unlock_extent(&inode->io_tree, cur_offset, - nocow_end, &cached_state); - ret = PTR_ERR(ordered); + if (ret < 0) goto error; - } - - if (btrfs_is_data_reloc_root(root)) - /* - * Error handled later, as we must prevent - * extent_clear_unlock_delalloc() in error handler - * from freeing metadata of created ordered extent. - */ - ret = btrfs_reloc_clone_csums(ordered); - btrfs_put_ordered_extent(ordered); - - extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, - locked_folio, &cached_state, - EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_CLEAR_DATA_RESV, - PAGE_UNLOCK | PAGE_SET_ORDERED); - cur_offset = extent_end; - - /* - * btrfs_reloc_clone_csums() error, now we're OK to call error - * handler, as metadata for created ordered extent will only - * be freed by btrfs_finish_ordered_io(). - */ - if (ret) - goto error; } btrfs_release_path(path); @@ -2226,11 +2223,11 @@ must_cow: if (cow_start != (u64)-1) { ret = fallback_to_cow(inode, locked_folio, cow_start, end); - cow_start = (u64)-1; if (ret) { cow_end = end; goto error; } + cow_start = (u64)-1; } btrfs_free_path(path); @@ -2244,27 +2241,44 @@ error: * start cur_offset end * |/////////////| | * + * In this case, cow_start should be (u64)-1. + * * For range [start, cur_offset) the folios are already unlocked (except * @locked_folio), EXTENT_DELALLOC already removed. - * Only need to clear the dirty flag as they will never be submitted. - * Ordered extent and extent maps are handled by - * btrfs_mark_ordered_io_finished() inside run_delalloc_range(). + * Need to clear the dirty flags and finish the ordered extents. 
+	 *
+	 * 2) Failed with error before calling fallback_to_cow()
+	 *
+	 *    start         cow_start       end
+	 *    |/////////////|               |
+	 *
+	 * In this case, only @cow_start is set, @cur_offset is between
+	 * [cow_start, end)
+	 *
+	 * It's mostly the same as case 1), just replace @cur_offset with
+	 * @cow_start.
 	 *
-	 * 2) Failed with error from fallback_to_cow()
-	 *    start         cur_offset      cow_end      end
+	 * 3) Failed with error from fallback_to_cow()
+	 *
+	 *    start         cow_start       cow_end      end
 	 *    |/////////////|-----------|               |
 	 *
-	 * For range [start, cur_offset) it's the same as case 1).
-	 * But for range [cur_offset, cow_end), the folios have dirty flag
-	 * cleared and unlocked, EXTENT_DEALLLOC cleared by cow_file_range().
+	 * In this case, both @cow_start and @cow_end are set.
 	 *
-	 * Thus we should not call extent_clear_unlock_delalloc() on range
-	 * [cur_offset, cow_end), as the folios are already unlocked.
+	 * For range [start, cow_start) it's the same as case 1).
+	 * But for range [cow_start, cow_end), all the cleanup is handled by
+	 * cow_file_range(), so we should not touch anything in that range.
 	 *
-	 * So clear the folio dirty flags for [start, cur_offset) first.
+	 * So for all the above cases, if @cow_start is set, clean up the ordered
+	 * extents for [start, @cow_start), otherwise clean up [start, @cur_offset).
 	 */
-	if (cur_offset > start)
+	if (cow_start != (u64)-1)
+		cur_offset = cow_start;
+
+	if (cur_offset > start) {
+		btrfs_cleanup_ordered_extents(inode, start, cur_offset - start);
 		cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret);
+	}

 	/*
 	 * If an error happened while a COW region is outstanding, cur_offset
@@ -2329,7 +2343,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol

 	if (should_nocow(inode, start, end)) {
 		ret = run_delalloc_nocow(inode, locked_folio, start, end);
-		goto out;
+		return ret;
 	}

 	if (btrfs_inode_can_compress(inode) &&
@@ -2343,10 +2357,6 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol
 	else
 		ret = cow_file_range(inode, locked_folio, start, end, NULL,
 				     false, false);
-
-out:
-	if (ret < 0)
-		btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
 	return ret;
 }

@@ -2873,6 +2883,21 @@ int btrfs_writepage_cow_fixup(struct folio *folio)
 		return 0;

 	/*
+	 * For experimental builds, we error out instead of EAGAIN.
+	 *
+	 * We should not hit such out-of-band dirty folios anymore.
+	 */
+	if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) {
+		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+		btrfs_err_rl(fs_info,
+	"root %lld ino %llu folio %llu is marked dirty without notifying the fs",
+			     BTRFS_I(inode)->root->root_key.objectid,
+			     btrfs_ino(BTRFS_I(inode)),
+			     folio_pos(folio));
+		return -EUCLEAN;
+	}
+
+	/*
 	 * folio_checked is set below when we create a fixup worker for this
 	 * folio, don't try to create another one if we're already
 	 * folio_test_checked.
@@ -2891,7 +2916,7 @@ int btrfs_writepage_cow_fixup(struct folio *folio)
 	 * We are already holding a reference to this inode from
 	 * write_cache_pages. We need to hold it because the space reservation
 	 * takes place outside of the folio lock, and we can't trust
-	 * page->mapping outside of the folio lock.
+	 * folio->mapping outside of the folio lock.
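/*
 * A standalone restatement of the error-path range selection documented in
 * cases 1)-3) above: if @cow_start is set, the range still owed a cleanup
 * ends there, otherwise it ends at @cur_offset; only [start, range_end)
 * needs its ordered extents cleaned and dirty flags cleared. The byte
 * values below are assumed for illustration.
 */
#include <stdint.h>
#include <stdio.h>

#define MODEL_U64_MAX ((uint64_t)-1)

static uint64_t cleanup_end(uint64_t cur_offset, uint64_t cow_start)
{
	return (cow_start != MODEL_U64_MAX) ? cow_start : cur_offset;
}

int main(void)
{
	const uint64_t start = 0;

	/* Case 2: failed before fallback_to_cow(), cow_start is set. */
	printf("cleanup [%llu, %llu)\n",
	       (unsigned long long)start,
	       (unsigned long long)cleanup_end(49152, 16384)); /* [0, 16384) */

	/* Case 1: no COW range pending, cow_start is unset. */
	printf("cleanup [%llu, %llu)\n",
	       (unsigned long long)start,
	       (unsigned long long)cleanup_end(49152, MODEL_U64_MAX)); /* [0, 49152) */
	return 0;
}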
*/ ihold(inode); btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); @@ -2947,8 +2972,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, if (!drop_args.extent_inserted) { ins.objectid = btrfs_ino(inode); - ins.offset = file_pos; ins.type = BTRFS_EXTENT_DATA_KEY; + ins.offset = file_pos; ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*stack_fi)); @@ -2983,8 +3008,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found); ins.objectid = disk_bytenr; - ins.offset = disk_num_bytes; ins.type = BTRFS_EXTENT_ITEM_KEY; + ins.offset = disk_num_bytes; ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes); if (ret) @@ -3402,6 +3427,7 @@ void btrfs_add_delayed_iput(struct btrfs_inode *inode) if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1)) return; + WARN_ON_ONCE(test_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state)); atomic_inc(&fs_info->nr_delayed_iputs); /* * Need to be irq safe here because we can be called from either an irq @@ -3522,7 +3548,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) struct extent_buffer *leaf; struct btrfs_key key, found_key; struct btrfs_trans_handle *trans; - struct inode *inode; u64 last_objectid = 0; int ret = 0, nr_unlink = 0; @@ -3541,6 +3566,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) key.offset = (u64)-1; while (1) { + struct btrfs_inode *inode; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; @@ -3664,10 +3691,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * deleted but wasn't. The inode number may have been reused, * but either way, we can delete the orphan item. */ - if (!inode || inode->i_nlink) { + if (!inode || inode->vfs_inode.i_nlink) { if (inode) { - ret = btrfs_drop_verity_items(BTRFS_I(inode)); - iput(inode); + ret = btrfs_drop_verity_items(inode); + iput(&inode->vfs_inode); inode = NULL; if (ret) goto out; @@ -3690,7 +3717,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) nr_unlink++; /* this will do delete_inode and everything for us */ - iput(inode); + iput(&inode->vfs_inode); } /* release the path since we're done with it */ btrfs_release_path(path); @@ -3840,12 +3867,13 @@ static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) * * On failure clean up the inode. 
*/ -static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) +static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path *path) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct inode *vfs_inode = &inode->vfs_inode; struct btrfs_key location; unsigned long ptr; int maybe_acls; @@ -3854,7 +3882,7 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) bool filled = false; int first_xattr_slot; - ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); + ret = btrfs_init_file_extent_tree(inode); if (ret) goto out; @@ -3864,7 +3892,7 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) ASSERT(path); - btrfs_get_inode_key(BTRFS_I(inode), &location); + btrfs_get_inode_key(inode, &location); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) { @@ -3884,41 +3912,41 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); - inode->i_mode = btrfs_inode_mode(leaf, inode_item); - set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); - i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); - i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); - btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item)); - btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, - round_up(i_size_read(inode), fs_info->sectorsize)); - - inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime), + vfs_inode->i_mode = btrfs_inode_mode(leaf, inode_item); + set_nlink(vfs_inode, btrfs_inode_nlink(leaf, inode_item)); + i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item)); + i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item)); + btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); + btrfs_inode_set_file_extent_range(inode, 0, + round_up(i_size_read(vfs_inode), fs_info->sectorsize)); + + inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime), btrfs_timespec_nsec(leaf, &inode_item->atime)); - inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime), + inode_set_mtime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->mtime), btrfs_timespec_nsec(leaf, &inode_item->mtime)); - inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime), + inode_set_ctime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->ctime), btrfs_timespec_nsec(leaf, &inode_item->ctime)); - BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime); - BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime); + inode->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime); + inode->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime); - inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); - BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); - BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); + inode_set_bytes(vfs_inode, btrfs_inode_nbytes(leaf, inode_item)); + inode->generation = btrfs_inode_generation(leaf, inode_item); + inode->last_trans = btrfs_inode_transid(leaf, inode_item); - inode_set_iversion_queried(inode, - btrfs_inode_sequence(leaf, inode_item)); - inode->i_generation = BTRFS_I(inode)->generation; - inode->i_rdev = 0; + 
inode_set_iversion_queried(vfs_inode, btrfs_inode_sequence(leaf, inode_item)); + vfs_inode->i_generation = inode->generation; + vfs_inode->i_rdev = 0; rdev = btrfs_inode_rdev(leaf, inode_item); - if (S_ISDIR(inode->i_mode)) - BTRFS_I(inode)->index_cnt = (u64)-1; + if (S_ISDIR(vfs_inode->i_mode)) + inode->index_cnt = (u64)-1; btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), - &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); + &inode->flags, &inode->ro_flags); + btrfs_update_inode_mapping_flags(inode); cache_index: /* @@ -3930,9 +3958,8 @@ cache_index: * This is required for both inode re-read from disk and delayed inode * in the delayed_nodes xarray. */ - if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info)) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + if (inode->last_trans == btrfs_get_fs_generation(fs_info)) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); /* * We don't persist the id of the transaction where an unlink operation @@ -3961,7 +3988,7 @@ cache_index: * transaction commits on fsync if our inode is a directory, or if our * inode is not a directory, logging its parent unnecessarily. */ - BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; + inode->last_unlink_trans = inode->last_trans; /* * Same logic as for last_unlink_trans. We don't persist the generation @@ -3969,15 +3996,15 @@ cache_index: * operation, so after eviction and reloading the inode we must be * pessimistic and assume the last transaction that modified the inode. */ - BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans; + inode->last_reflink_trans = inode->last_trans; path->slots[0]++; - if (inode->i_nlink != 1 || + if (vfs_inode->i_nlink != 1 || path->slots[0] >= btrfs_header_nritems(leaf)) goto cache_acl; btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); - if (location.objectid != btrfs_ino(BTRFS_I(inode))) + if (location.objectid != btrfs_ino(inode)) goto cache_acl; ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); @@ -3985,13 +4012,12 @@ cache_index: struct btrfs_inode_ref *ref; ref = (struct btrfs_inode_ref *)ptr; - BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); + inode->dir_index = btrfs_inode_ref_index(leaf, ref); } else if (location.type == BTRFS_INODE_EXTREF_KEY) { struct btrfs_inode_extref *extref; extref = (struct btrfs_inode_extref *)ptr; - BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, - extref); + inode->dir_index = btrfs_inode_extref_index(leaf, extref); } cache_acl: /* @@ -3999,50 +4025,49 @@ cache_acl: * any xattrs or acls */ maybe_acls = acls_after_inode_item(leaf, path->slots[0], - btrfs_ino(BTRFS_I(inode)), &first_xattr_slot); + btrfs_ino(inode), &first_xattr_slot); if (first_xattr_slot != -1) { path->slots[0] = first_xattr_slot; ret = btrfs_load_inode_props(inode, path); if (ret) btrfs_err(fs_info, "error loading props for ino %llu (root %llu): %d", - btrfs_ino(BTRFS_I(inode)), - btrfs_root_id(root), ret); + btrfs_ino(inode), btrfs_root_id(root), ret); } if (!maybe_acls) - cache_no_acl(inode); + cache_no_acl(vfs_inode); - switch (inode->i_mode & S_IFMT) { + switch (vfs_inode->i_mode & S_IFMT) { case S_IFREG: - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_fop = &btrfs_file_operations; - inode->i_op = &btrfs_file_inode_operations; + vfs_inode->i_mapping->a_ops = &btrfs_aops; + vfs_inode->i_fop = &btrfs_file_operations; + vfs_inode->i_op = &btrfs_file_inode_operations; break; case S_IFDIR: - inode->i_fop = &btrfs_dir_file_operations; - inode->i_op = 
&btrfs_dir_inode_operations; + vfs_inode->i_fop = &btrfs_dir_file_operations; + vfs_inode->i_op = &btrfs_dir_inode_operations; break; case S_IFLNK: - inode->i_op = &btrfs_symlink_inode_operations; - inode_nohighmem(inode); - inode->i_mapping->a_ops = &btrfs_aops; + vfs_inode->i_op = &btrfs_symlink_inode_operations; + inode_nohighmem(vfs_inode); + vfs_inode->i_mapping->a_ops = &btrfs_aops; break; default: - inode->i_op = &btrfs_special_inode_operations; - init_special_inode(inode, inode->i_mode, rdev); + vfs_inode->i_op = &btrfs_special_inode_operations; + init_special_inode(vfs_inode, vfs_inode->i_mode, rdev); break; } btrfs_sync_inode_flags_to_i_flags(inode); - ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); + ret = btrfs_add_inode_to_root(inode, true); if (ret) goto out; return 0; out: - iget_failed(inode); + iget_failed(vfs_inode); return ret; } @@ -5597,7 +5622,7 @@ static int btrfs_find_actor(struct inode *inode, void *opaque) args->root == BTRFS_I(inode)->root; } -static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) +static struct btrfs_inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; @@ -5609,40 +5634,42 @@ static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor, btrfs_init_locked_inode, (void *)&args); - return inode; + if (!inode) + return NULL; + return BTRFS_I(inode); } /* * Get an inode object given its inode number and corresponding root. Path is * preallocated to prevent recursing back to iget through allocator. */ -struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, - struct btrfs_path *path) +struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, + struct btrfs_path *path) { - struct inode *inode; + struct btrfs_inode *inode; int ret; inode = btrfs_iget_locked(ino, root); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode->vfs_inode.i_state & I_NEW)) return inode; ret = btrfs_read_locked_inode(inode, path); if (ret) return ERR_PTR(ret); - unlock_new_inode(inode); + unlock_new_inode(&inode->vfs_inode); return inode; } /* * Get an inode object given its inode number and corresponding root. 
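/*
 * A sketch of the embedding pattern that the btrfs_inode conversions around
 * here lean on: the VFS object lives inside the filesystem object, so the
 * two pointers interconvert with container_of()-style arithmetic in one
 * direction and &inode->vfs_inode in the other. Userspace model with
 * simplified stand-in types, not the kernel definitions.
 */
#include <stddef.h>
#include <stdio.h>

struct model_vfs_inode { unsigned long i_ino; };

struct model_btrfs_inode {
	unsigned long extra_state;
	struct model_vfs_inode vfs_inode;	/* embedded, not pointed-to */
};

#define MODEL_BTRFS_I(vi) \
	((struct model_btrfs_inode *)((char *)(vi) - \
		offsetof(struct model_btrfs_inode, vfs_inode)))

int main(void)
{
	struct model_btrfs_inode bi = { .vfs_inode = { .i_ino = 257 } };
	struct model_vfs_inode *vi = &bi.vfs_inode;

	printf("round trip ok: %d\n", MODEL_BTRFS_I(vi) == &bi);	/* 1 */
	return 0;
}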
*/ -struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) +struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root) { - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_path *path; int ret; @@ -5650,7 +5677,7 @@ struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode->vfs_inode.i_state & I_NEW)) return inode; path = btrfs_alloc_path(); @@ -5662,43 +5689,46 @@ struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) if (ret) return ERR_PTR(ret); - unlock_new_inode(inode); + unlock_new_inode(&inode->vfs_inode); return inode; } -static struct inode *new_simple_dir(struct inode *dir, - struct btrfs_key *key, - struct btrfs_root *root) +static struct btrfs_inode *new_simple_dir(struct inode *dir, + struct btrfs_key *key, + struct btrfs_root *root) { struct timespec64 ts; - struct inode *inode = new_inode(dir->i_sb); + struct inode *vfs_inode; + struct btrfs_inode *inode; - if (!inode) + vfs_inode = new_inode(dir->i_sb); + if (!vfs_inode) return ERR_PTR(-ENOMEM); - BTRFS_I(inode)->root = btrfs_grab_root(root); - BTRFS_I(inode)->ref_root_id = key->objectid; - set_bit(BTRFS_INODE_ROOT_STUB, &BTRFS_I(inode)->runtime_flags); - set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); + inode = BTRFS_I(vfs_inode); + inode->root = btrfs_grab_root(root); + inode->ref_root_id = key->objectid; + set_bit(BTRFS_INODE_ROOT_STUB, &inode->runtime_flags); + set_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags); - btrfs_set_inode_number(BTRFS_I(inode), BTRFS_EMPTY_SUBVOL_DIR_OBJECTID); + btrfs_set_inode_number(inode, BTRFS_EMPTY_SUBVOL_DIR_OBJECTID); /* * We only need lookup, the rest is read-only and there's no inode * associated with the dentry */ - inode->i_op = &simple_dir_inode_operations; - inode->i_opflags &= ~IOP_XATTR; - inode->i_fop = &simple_dir_operations; - inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; + vfs_inode->i_op = &simple_dir_inode_operations; + vfs_inode->i_opflags &= ~IOP_XATTR; + vfs_inode->i_fop = &simple_dir_operations; + vfs_inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; - ts = inode_set_ctime_current(inode); - inode_set_mtime_to_ts(inode, ts); - inode_set_atime_to_ts(inode, inode_get_atime(dir)); - BTRFS_I(inode)->i_otime_sec = ts.tv_sec; - BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec; + ts = inode_set_ctime_current(vfs_inode); + inode_set_mtime_to_ts(vfs_inode, ts); + inode_set_atime_to_ts(vfs_inode, inode_get_atime(dir)); + inode->i_otime_sec = ts.tv_sec; + inode->i_otime_nsec = ts.tv_nsec; - inode->i_uid = dir->i_uid; - inode->i_gid = dir->i_gid; + vfs_inode->i_uid = dir->i_uid; + vfs_inode->i_gid = dir->i_gid; return inode; } @@ -5712,15 +5742,15 @@ static_assert(BTRFS_FT_FIFO == FT_FIFO); static_assert(BTRFS_FT_SOCK == FT_SOCK); static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK); -static inline u8 btrfs_inode_type(struct inode *inode) +static inline u8 btrfs_inode_type(const struct btrfs_inode *inode) { - return fs_umode_to_ftype(inode->i_mode); + return fs_umode_to_ftype(inode->vfs_inode.i_mode); } struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) { struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *sub_root = root; struct btrfs_key location = { 0 }; @@ -5737,18 +5767,18 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (location.type == BTRFS_INODE_ITEM_KEY) { inode = 
btrfs_iget(location.objectid, root); if (IS_ERR(inode)) - return inode; + return ERR_CAST(inode); /* Do extra check against inode mode with di_type */ if (btrfs_inode_type(inode) != di_type) { btrfs_crit(fs_info, "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u", - inode->i_mode, btrfs_inode_type(inode), + inode->vfs_inode.i_mode, btrfs_inode_type(inode), di_type); - iput(inode); + iput(&inode->vfs_inode); return ERR_PTR(-EUCLEAN); } - return inode; + return &inode->vfs_inode; } ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry, @@ -5763,19 +5793,22 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) btrfs_put_root(sub_root); if (IS_ERR(inode)) - return inode; + return ERR_CAST(inode); down_read(&fs_info->cleanup_work_sem); - if (!sb_rdonly(inode->i_sb)) + if (!sb_rdonly(inode->vfs_inode.i_sb)) ret = btrfs_orphan_cleanup(sub_root); up_read(&fs_info->cleanup_work_sem); if (ret) { - iput(inode); + iput(&inode->vfs_inode); inode = ERR_PTR(ret); } } - return inode; + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return &inode->vfs_inode; } static int btrfs_dentry_delete(const struct dentry *dentry) @@ -6248,7 +6281,7 @@ static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode * inode->flags |= BTRFS_INODE_NODATASUM; } - btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + btrfs_sync_inode_flags_to_i_flags(inode); } int btrfs_create_new_inode(struct btrfs_trans_handle *trans, @@ -6334,6 +6367,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (btrfs_test_opt(fs_info, NODATACOW)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM; + btrfs_update_inode_mapping_flags(BTRFS_I(inode)); } ret = btrfs_insert_inode_locked(inode); @@ -6427,7 +6461,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, path = NULL; if (args->subvol) { - struct inode *parent; + struct btrfs_inode *parent; /* * Subvolumes inherit properties from their parent subvolume, @@ -6437,11 +6471,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (IS_ERR(parent)) { ret = PTR_ERR(parent); } else { - ret = btrfs_inode_inherit_props(trans, inode, parent); - iput(parent); + ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode), + parent); + iput(&parent->vfs_inode); } } else { - ret = btrfs_inode_inherit_props(trans, inode, dir); + ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode), + BTRFS_I(dir)); } if (ret) { btrfs_err(fs_info, @@ -6539,7 +6575,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, return ret; ret = btrfs_insert_dir_item(trans, name, parent_inode, &key, - btrfs_inode_type(&inode->vfs_inode), index); + btrfs_inode_type(inode), index); if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; else if (ret) { @@ -6739,18 +6775,18 @@ fail: return err; } -static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; inode = new_inode(dir->i_sb); if (!inode) - return -ENOMEM; + return ERR_PTR(-ENOMEM); inode_init_owner(idmap, inode, dir, S_IFDIR | mode); inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; - return btrfs_create_common(dir, dentry, inode); + return ERR_PTR(btrfs_create_common(dir, dentry, inode)); } static noinline int uncompress_inline(struct btrfs_path *path, @@ -6759,6 +6795,7 @@ static noinline int uncompress_inline(struct 
btrfs_path *path, { int ret; struct extent_buffer *leaf = path->nodes[0]; + const u32 blocksize = leaf->fs_info->sectorsize; char *tmp; size_t max_size; unsigned long inline_size; @@ -6775,7 +6812,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); - max_size = min_t(unsigned long, PAGE_SIZE, max_size); + max_size = min_t(unsigned long, blocksize, max_size); ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size, max_size); @@ -6787,14 +6824,15 @@ static noinline int uncompress_inline(struct btrfs_path *path, * cover that region here. */ - if (max_size < PAGE_SIZE) - folio_zero_range(folio, max_size, PAGE_SIZE - max_size); + if (max_size < blocksize) + folio_zero_range(folio, max_size, blocksize - max_size); kfree(tmp); return ret; } static int read_inline_extent(struct btrfs_path *path, struct folio *folio) { + const u32 blocksize = path->nodes[0]->fs_info->sectorsize; struct btrfs_file_extent_item *fi; void *kaddr; size_t copy_size; @@ -6809,14 +6847,14 @@ static int read_inline_extent(struct btrfs_path *path, struct folio *folio) if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) return uncompress_inline(path, folio, fi); - copy_size = min_t(u64, PAGE_SIZE, + copy_size = min_t(u64, blocksize, btrfs_file_extent_ram_bytes(path->nodes[0], fi)); kaddr = kmap_local_folio(folio, 0); read_extent_buffer(path->nodes[0], kaddr, btrfs_file_extent_inline_start(fi), copy_size); kunmap_local(kaddr); - if (copy_size < PAGE_SIZE) - folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size); + if (copy_size < blocksize) + folio_zero_range(folio, copy_size, blocksize - copy_size); return 0; } @@ -7057,17 +7095,17 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) * NOTE: This only checks the file extents, caller is responsible to wait for * any ordered extents. 
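/*
 * A minimal userspace sketch of the bounded copy-and-zero pattern used by
 * uncompress_inline()/read_inline_extent() above: inline extents are capped
 * by the filesystem block size (sectorsize), not PAGE_SIZE, so on subpage
 * setups only the first block of the folio carries inline data and the rest
 * of that block must be zero-filled. The helper name below is illustrative.
 */
#include <string.h>

static void fill_inline_block(char *dst, const char *src,
			      size_t ram_bytes, size_t blocksize)
{
	size_t copy_size = ram_bytes < blocksize ? ram_bytes : blocksize;

	memcpy(dst, src, copy_size);
	/* Zero the tail so stale data never leaks past the inline extent. */
	if (copy_size < blocksize)
		memset(dst + copy_size, 0, blocksize - copy_size);
}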
*/ -noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, +noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, bool nowait) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct can_nocow_file_extent_args nocow_args = { 0 }; struct btrfs_path *path; int ret; struct extent_buffer *leaf; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_file_extent_item *fi; struct btrfs_key key; int found_type; @@ -7077,8 +7115,8 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, return -ENOMEM; path->nowait = nowait; - ret = btrfs_lookup_file_extent(NULL, root, path, - btrfs_ino(BTRFS_I(inode)), offset, 0); + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), + offset, 0); if (ret < 0) goto out; @@ -7093,7 +7131,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, ret = 0; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != btrfs_ino(BTRFS_I(inode)) || + if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) { /* not our file or wrong item type, must cow */ goto out; @@ -7114,7 +7152,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, nocow_args.end = offset + *len - 1; nocow_args.free_path = true; - ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args); + ret = can_nocow_file_extent(path, &key, inode, &nocow_args); /* can_nocow_file_extent() has freed the path. */ path = NULL; @@ -7130,7 +7168,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, nocow_args.file_extent.offset)) goto out; - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + if (!(inode->flags & BTRFS_INODE_NODATACOW) && found_type == BTRFS_FILE_EXTENT_PREALLOC) { u64 range_end; @@ -7235,7 +7273,7 @@ static void wait_subpage_spinlock(struct folio *folio) struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_is_subpage(fs_info, folio)) return; ASSERT(folio_test_private(folio) && folio_get_private(folio)); @@ -7259,7 +7297,7 @@ static void wait_subpage_spinlock(struct folio *folio) static int btrfs_launder_folio(struct folio *folio) { return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio), - PAGE_SIZE, NULL); + folio_size(folio), NULL); } static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) @@ -8494,8 +8532,6 @@ static int start_delalloc_inodes(struct btrfs_root *root, struct writeback_control *wbc, bool snapshot, bool in_reclaim_context) { - struct btrfs_inode *binode; - struct inode *inode; struct btrfs_delalloc_work *work, *next; LIST_HEAD(works); LIST_HEAD(splice); @@ -8506,30 +8542,30 @@ static int start_delalloc_inodes(struct btrfs_root *root, spin_lock(&root->delalloc_lock); list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { - binode = list_entry(splice.next, struct btrfs_inode, - delalloc_inodes); + struct btrfs_inode *inode; + struct inode *tmp_inode; + + inode = list_entry(splice.next, struct btrfs_inode, delalloc_inodes); - list_move_tail(&binode->delalloc_inodes, - &root->delalloc_inodes); + list_move_tail(&inode->delalloc_inodes, &root->delalloc_inodes); if 
(in_reclaim_context && - test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags)) + test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags)) continue; - inode = igrab(&binode->vfs_inode); - if (!inode) { + tmp_inode = igrab(&inode->vfs_inode); + if (!tmp_inode) { cond_resched_lock(&root->delalloc_lock); continue; } spin_unlock(&root->delalloc_lock); if (snapshot) - set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, - &binode->runtime_flags); + set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags); if (full_flush) { - work = btrfs_alloc_delalloc_work(inode); + work = btrfs_alloc_delalloc_work(&inode->vfs_inode); if (!work) { - iput(inode); + iput(&inode->vfs_inode); ret = -ENOMEM; goto out; } @@ -8537,8 +8573,8 @@ static int start_delalloc_inodes(struct btrfs_root *root, btrfs_queue_work(root->fs_info->flush_workers, &work->work); } else { - ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc); - btrfs_add_delayed_iput(BTRFS_I(inode)); + ret = filemap_fdatawrite_wbc(inode->vfs_inode.i_mapping, wbc); + btrfs_add_delayed_iput(inode); if (ret || wbc->nr_to_write <= 0) goto out; } @@ -8655,7 +8691,12 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct extent_buffer *leaf; name_len = strlen(symname); - if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) + /* + * Symlinks utilize uncompressed inline extent data, which should not + * reach block size. + */ + if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) || + name_len >= fs_info->sectorsize) return -ENAMETOOLONG; inode = new_inode(dir->i_sb); @@ -8694,8 +8735,8 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, goto out; } key.objectid = btrfs_ino(BTRFS_I(inode)); - key.offset = 0; key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; datasize = btrfs_file_extent_calc_inline_size(name_len); err = btrfs_insert_empty_item(trans, root, path, &key, datasize); @@ -9141,7 +9182,7 @@ out: } struct btrfs_encoded_read_private { - struct completion done; + struct completion *sync_reads; void *uring_ctx; refcount_t pending_refs; blk_status_t status; @@ -9153,11 +9194,10 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) if (bbio->bio.bi_status) { /* - * The memory barrier implied by the atomic_dec_return() here - * pairs with the memory barrier implied by the - * atomic_dec_return() or io_wait_event() in - * btrfs_encoded_read_regular_fill_pages() to ensure that this - * write is observed before the load of status in + * The memory barrier implied by the refcount_dec_and_test() here + * pairs with the memory barrier implied by the refcount_dec_and_test() + * in btrfs_encoded_read_regular_fill_pages() to ensure that + * this write is observed before the load of status in * btrfs_encoded_read_regular_fill_pages(). 
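/*
 * A standalone model of the barrier pairing described above, assuming C11
 * atomics as a stand-in for the ordering implied by the kernel's
 * refcount_dec_and_test(): the final decrement acts as a release on the
 * side that wrote status and as an acquire on the side that reads it, so
 * the plain status store is visible once the counter is seen to hit zero.
 */
#include <stdatomic.h>

static atomic_int pending;
static int status;	/* plain store/load, ordered only by the refcount */

static void endio_model(int err)
{
	if (err)
		status = err;	/* like WRITE_ONCE(priv->status, ...) */
	/* Release on the final decrement publishes the status write. */
	if (atomic_fetch_sub_explicit(&pending, 1, memory_order_acq_rel) == 1) {
		/* Last reference dropped: the completion would fire here. */
	}
}

static int waiter_model(void)
{
	/* Acquire here pairs with the release in endio_model(). */
	if (atomic_fetch_sub_explicit(&pending, 1, memory_order_acq_rel) != 1) {
		/* Not last: the real code waits on the completion instead,
		 * which provides the same ordering guarantee. */
	}
	return status;		/* like READ_ONCE(priv->status) */
}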
*/ WRITE_ONCE(priv->status, bbio->bio.bi_status); @@ -9169,7 +9209,7 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) btrfs_uring_read_extent_endio(priv->uring_ctx, err); kfree(priv); } else { - complete(&priv->done); + complete(priv->sync_reads); } } bio_put(&bbio->bio); @@ -9180,16 +9220,26 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, struct page **pages, void *uring_ctx) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_encoded_read_private *priv; + struct btrfs_encoded_read_private *priv, sync_priv; + struct completion sync_reads; unsigned long i = 0; struct btrfs_bio *bbio; int ret; - priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS); - if (!priv) - return -ENOMEM; + /* + * Fast path for synchronous reads which completes in this call, io_uring + * needs longer time span. + */ + if (uring_ctx) { + priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS); + if (!priv) + return -ENOMEM; + } else { + priv = &sync_priv; + init_completion(&sync_reads); + priv->sync_reads = &sync_reads; + } - init_completion(&priv->done); refcount_set(&priv->pending_refs, 1); priv->status = 0; priv->uring_ctx = uring_ctx; @@ -9232,11 +9282,9 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, return -EIOCBQUEUED; } else { if (!refcount_dec_and_test(&priv->pending_refs)) - wait_for_completion_io(&priv->done); + wait_for_completion_io(&sync_reads); /* See btrfs_encoded_read_endio() for ordering. */ - ret = blk_status_to_errno(READ_ONCE(priv->status)); - kfree(priv); - return ret; + return blk_status_to_errno(READ_ONCE(priv->status)); } } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6c18bad53cd3..a13d81bb56a0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -118,8 +118,8 @@ struct btrfs_ioctl_encoded_io_args_32 { #endif /* Mask out flags that are inappropriate for the given type of inode. */ -static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode, - unsigned int flags) +static unsigned int btrfs_mask_fsflags_for_type(const struct inode *inode, + unsigned int flags) { if (S_ISDIR(inode->i_mode)) return flags; @@ -133,11 +133,11 @@ static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode, * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS * ioctl. */ -static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode) +static unsigned int btrfs_inode_flags_to_fsflags(const struct btrfs_inode *inode) { unsigned int iflags = 0; - u32 flags = binode->flags; - u32 ro_flags = binode->ro_flags; + u32 flags = inode->flags; + u32 ro_flags = inode->ro_flags; if (flags & BTRFS_INODE_SYNC) iflags |= FS_SYNC_FL; @@ -167,25 +167,24 @@ static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode) /* * Update inode->i_flags based on the btrfs internal flags. 
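/*
 * What set_mask_bits() accomplishes for the flag sync below, as a
 * non-atomic sketch (the kernel helper performs this same update inside
 * a cmpxchg loop): every bit in the mask is cleared, then the freshly
 * computed subset is set, so a stale VFS flag bit can never survive.
 */
static unsigned long mask_bits_model(unsigned long word,
				     unsigned long mask,
				     unsigned long new_bits)
{
	return (word & ~mask) | new_bits;
}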
*/ -void btrfs_sync_inode_flags_to_i_flags(struct inode *inode) +void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode) { - struct btrfs_inode *binode = BTRFS_I(inode); unsigned int new_fl = 0; - if (binode->flags & BTRFS_INODE_SYNC) + if (inode->flags & BTRFS_INODE_SYNC) new_fl |= S_SYNC; - if (binode->flags & BTRFS_INODE_IMMUTABLE) + if (inode->flags & BTRFS_INODE_IMMUTABLE) new_fl |= S_IMMUTABLE; - if (binode->flags & BTRFS_INODE_APPEND) + if (inode->flags & BTRFS_INODE_APPEND) new_fl |= S_APPEND; - if (binode->flags & BTRFS_INODE_NOATIME) + if (inode->flags & BTRFS_INODE_NOATIME) new_fl |= S_NOATIME; - if (binode->flags & BTRFS_INODE_DIRSYNC) + if (inode->flags & BTRFS_INODE_DIRSYNC) new_fl |= S_DIRSYNC; - if (binode->ro_flags & BTRFS_INODE_RO_VERITY) + if (inode->ro_flags & BTRFS_INODE_RO_VERITY) new_fl |= S_VERITY; - set_mask_bits(&inode->i_flags, + set_mask_bits(&inode->vfs_inode.i_flags, S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC | S_VERITY, new_fl); } @@ -219,7 +218,7 @@ static int check_fsflags(unsigned int old_flags, unsigned int flags) return 0; } -static int check_fsflags_compatible(struct btrfs_fs_info *fs_info, +static int check_fsflags_compatible(const struct btrfs_fs_info *fs_info, unsigned int flags) { if (btrfs_is_zoned(fs_info) && (flags & FS_NOCOW_FL)) @@ -248,24 +247,23 @@ static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_ */ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) { - struct btrfs_inode *binode = BTRFS_I(d_inode(dentry)); + const struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); - fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode)); + fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(inode)); return 0; } int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { - struct inode *inode = d_inode(dentry); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_inode *binode = BTRFS_I(inode); - struct btrfs_root *root = binode->root; + struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; unsigned int fsflags, old_fsflags; int ret; const char *comp = NULL; - u32 binode_flags; + u32 inode_flags; if (btrfs_root_readonly(root)) return -EROFS; @@ -273,8 +271,8 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; - fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags); - old_fsflags = btrfs_inode_flags_to_fsflags(binode); + fsflags = btrfs_mask_fsflags_for_type(&inode->vfs_inode, fa->flags); + old_fsflags = btrfs_inode_flags_to_fsflags(inode); ret = check_fsflags(old_fsflags, fsflags); if (ret) return ret; @@ -283,27 +281,27 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, if (ret) return ret; - binode_flags = binode->flags; + inode_flags = inode->flags; if (fsflags & FS_SYNC_FL) - binode_flags |= BTRFS_INODE_SYNC; + inode_flags |= BTRFS_INODE_SYNC; else - binode_flags &= ~BTRFS_INODE_SYNC; + inode_flags &= ~BTRFS_INODE_SYNC; if (fsflags & FS_IMMUTABLE_FL) - binode_flags |= BTRFS_INODE_IMMUTABLE; + inode_flags |= BTRFS_INODE_IMMUTABLE; else - binode_flags &= ~BTRFS_INODE_IMMUTABLE; + inode_flags &= ~BTRFS_INODE_IMMUTABLE; if (fsflags & FS_APPEND_FL) - binode_flags |= BTRFS_INODE_APPEND; + inode_flags |= BTRFS_INODE_APPEND; else - binode_flags &= ~BTRFS_INODE_APPEND; + inode_flags &= ~BTRFS_INODE_APPEND; if (fsflags & FS_NODUMP_FL) - binode_flags |= 
BTRFS_INODE_NODUMP; + inode_flags |= BTRFS_INODE_NODUMP; else - binode_flags &= ~BTRFS_INODE_NODUMP; + inode_flags &= ~BTRFS_INODE_NODUMP; if (fsflags & FS_NOATIME_FL) - binode_flags |= BTRFS_INODE_NOATIME; + inode_flags |= BTRFS_INODE_NOATIME; else - binode_flags &= ~BTRFS_INODE_NOATIME; + inode_flags &= ~BTRFS_INODE_NOATIME; /* If coming from FS_IOC_FSSETXATTR then skip unconverted flags */ if (!fa->flags_valid) { @@ -315,32 +313,32 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, } if (fsflags & FS_DIRSYNC_FL) - binode_flags |= BTRFS_INODE_DIRSYNC; + inode_flags |= BTRFS_INODE_DIRSYNC; else - binode_flags &= ~BTRFS_INODE_DIRSYNC; + inode_flags &= ~BTRFS_INODE_DIRSYNC; if (fsflags & FS_NOCOW_FL) { - if (S_ISREG(inode->i_mode)) { + if (S_ISREG(inode->vfs_inode.i_mode)) { /* * It's safe to turn csums off here, no extents exist. * Otherwise we want the flag to reflect the real COW * status of the file and will not set it. */ - if (inode->i_size == 0) - binode_flags |= BTRFS_INODE_NODATACOW | - BTRFS_INODE_NODATASUM; + if (inode->vfs_inode.i_size == 0) + inode_flags |= BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM; } else { - binode_flags |= BTRFS_INODE_NODATACOW; + inode_flags |= BTRFS_INODE_NODATACOW; } } else { /* * Revert back under same assumptions as above */ - if (S_ISREG(inode->i_mode)) { - if (inode->i_size == 0) - binode_flags &= ~(BTRFS_INODE_NODATACOW | - BTRFS_INODE_NODATASUM); + if (S_ISREG(inode->vfs_inode.i_mode)) { + if (inode->vfs_inode.i_size == 0) + inode_flags &= ~(BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM); } else { - binode_flags &= ~BTRFS_INODE_NODATACOW; + inode_flags &= ~BTRFS_INODE_NODATACOW; } } @@ -350,21 +348,21 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, * things smaller. */ if (fsflags & FS_NOCOMP_FL) { - binode_flags &= ~BTRFS_INODE_COMPRESS; - binode_flags |= BTRFS_INODE_NOCOMPRESS; + inode_flags &= ~BTRFS_INODE_COMPRESS; + inode_flags |= BTRFS_INODE_NOCOMPRESS; } else if (fsflags & FS_COMPR_FL) { - if (IS_SWAPFILE(inode)) + if (IS_SWAPFILE(&inode->vfs_inode)) return -ETXTBSY; - binode_flags |= BTRFS_INODE_COMPRESS; - binode_flags &= ~BTRFS_INODE_NOCOMPRESS; + inode_flags |= BTRFS_INODE_COMPRESS; + inode_flags &= ~BTRFS_INODE_NOCOMPRESS; comp = btrfs_compress_type2str(fs_info->compress_type); if (!comp || comp[0] == 0) comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB); } else { - binode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); + inode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } /* @@ -376,15 +374,14 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, return PTR_ERR(trans); if (comp) { - ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression", + ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp, strlen(comp), 0); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } } else { - ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression", - NULL, 0, 0); + ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL, 0, 0); if (ret && ret != -ENODATA) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -392,18 +389,19 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, } update_flags: - binode->flags = binode_flags; + inode->flags = inode_flags; + btrfs_update_inode_mapping_flags(inode); btrfs_sync_inode_flags_to_i_flags(inode); - inode_inc_iversion(inode); - inode_set_ctime_current(inode); - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + inode_inc_iversion(&inode->vfs_inode); + inode_set_ctime_current(&inode->vfs_inode); + ret = btrfs_update_inode(trans, 
inode); out_end_trans: btrfs_end_transaction(trans); return ret; } -static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg) +static int btrfs_ioctl_getversion(const struct inode *inode, int __user *arg) { return put_user(inode->i_generation, arg); } @@ -475,7 +473,7 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, * Calculate the number of transaction items to reserve for creating a subvolume * or snapshot, not including the inode, directory entries, or parent directory. */ -static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit) +static unsigned int create_subvol_num_items(const struct btrfs_qgroup_inherit *inherit) { /* * 1 to add root block @@ -617,8 +615,8 @@ static noinline int create_subvol(struct mnt_idmap *idmap, btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID); key.objectid = objectid; - key.offset = 0; key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = 0; ret = btrfs_insert_root(trans, fs_info->tree_root, &key, root_item); if (ret) { @@ -878,7 +876,7 @@ static int btrfs_may_delete(struct mnt_idmap *idmap, /* copy of may_create in fs/namei.c() */ static inline int btrfs_may_create(struct mnt_idmap *idmap, - struct inode *dir, struct dentry *child) + struct inode *dir, const struct dentry *child) { if (d_really_is_positive(child)) return -EEXIST; @@ -1033,17 +1031,14 @@ static noinline int btrfs_ioctl_resize(struct file *file, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_fs_info *fs_info = root->fs_info; u64 new_size; u64 old_size; u64 devid = 1; - struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_vol_args *vol_args; - struct btrfs_trans_handle *trans; struct btrfs_device *device = NULL; char *sizestr; - char *retptr; char *devstr = NULL; int ret = 0; int mod = 0; @@ -1111,6 +1106,8 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (!strcmp(sizestr, "max")) new_size = bdev_nr_bytes(device->bdev); else { + char *retptr; + if (sizestr[0] == '-') { mod = -1; sizestr++; @@ -1158,6 +1155,8 @@ static noinline int btrfs_ioctl_resize(struct file *file, new_size = round_down(new_size, fs_info->sectorsize); if (new_size > old_size) { + struct btrfs_trans_handle *trans; + trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -1336,15 +1335,15 @@ free_args: return ret; } -static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode, +static noinline int btrfs_ioctl_subvol_getflags(struct btrfs_inode *inode, void __user *arg) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; u64 flags = 0; - if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) return -EINVAL; down_read(&fs_info->subvol_sem); @@ -1447,8 +1446,8 @@ out: return ret; } -static noinline int key_in_sk(struct btrfs_key *key, - struct btrfs_ioctl_search_key *sk) +static noinline int key_in_sk(const struct btrfs_key *key, + const struct btrfs_ioctl_search_key *sk) { struct btrfs_key test; int ret; @@ -1473,7 +1472,7 @@ static noinline int key_in_sk(struct btrfs_key *key, static noinline int copy_to_sk(struct btrfs_path *path, struct btrfs_key *key, - struct btrfs_ioctl_search_key *sk, + const 
struct btrfs_ioctl_search_key *sk, u64 *buf_size, char __user *ubuf, unsigned long *sk_offset, @@ -1530,8 +1529,8 @@ static noinline int copy_to_sk(struct btrfs_path *path, } sh.objectid = key->objectid; - sh.offset = key->offset; sh.type = key->type; + sh.offset = key->offset; sh.len = item_len; sh.transid = found_transid; @@ -1604,13 +1603,12 @@ out: return ret; } -static noinline int search_ioctl(struct inode *inode, +static noinline int search_ioctl(struct btrfs_root *root, struct btrfs_ioctl_search_key *sk, u64 *buf_size, char __user *ubuf) { - struct btrfs_fs_info *info = inode_to_fs_info(inode); - struct btrfs_root *root; + struct btrfs_fs_info *info = root->fs_info; struct btrfs_key key; struct btrfs_path *path; int ret; @@ -1627,9 +1625,10 @@ static noinline int search_ioctl(struct inode *inode, return -ENOMEM; if (sk->tree_id == 0) { - /* search the root of the inode that was passed */ - root = btrfs_grab_root(BTRFS_I(inode)->root); + /* Search the root that we got passed. */ + root = btrfs_grab_root(root); } else { + /* Look up the root from the arguments. */ root = btrfs_get_fs_root(info, sk->tree_id, true); if (IS_ERR(root)) { btrfs_free_path(path); @@ -1642,21 +1641,19 @@ static noinline int search_ioctl(struct inode *inode, key.offset = sk->min_offset; while (1) { - ret = -EFAULT; /* * Ensure that the whole user buffer is faulted in at sub-page * granularity, otherwise the loop may live-lock. */ - if (fault_in_subpage_writeable(ubuf + sk_offset, - *buf_size - sk_offset)) + if (fault_in_subpage_writeable(ubuf + sk_offset, *buf_size - sk_offset)) { + ret = -EFAULT; break; + } ret = btrfs_search_forward(root, &key, path, sk->min_transid); - if (ret != 0) { - if (ret > 0) - ret = 0; - goto err; - } + if (ret) + break; + ret = copy_to_sk(path, &key, sk, buf_size, ubuf, &sk_offset, &num_found); btrfs_release_path(path); @@ -1664,16 +1661,17 @@ static noinline int search_ioctl(struct inode *inode, break; } + /* Normalize return values from btrfs_search_forward() and copy_to_sk(). 
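/*
 * The return-value convention being normalized above, sketched as a
 * generic loop: the helpers return <0 on error, 0 to keep iterating, and
 * >0 for a benign stop ("no more items" or "buffer full"); the caller
 * folds the >0 case to 0 so user space sees success plus however many
 * items were copied. Function pointers here are illustrative only.
 */
static int drain_items_model(int (*next_item)(void), int (*copy_item)(void))
{
	int ret;

	while (1) {
		ret = next_item();
		if (ret)
			break;
		ret = copy_item();
		if (ret)
			break;
	}
	return ret > 0 ? 0 : ret;
}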
*/ if (ret > 0) ret = 0; -err: + sk->nr_items = num_found; btrfs_put_root(root); btrfs_free_path(path); return ret; } -static noinline int btrfs_ioctl_tree_search(struct inode *inode, +static noinline int btrfs_ioctl_tree_search(struct btrfs_root *root, void __user *argp) { struct btrfs_ioctl_search_args __user *uargs = argp; @@ -1689,7 +1687,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode, buf_size = sizeof(uargs->buf); - ret = search_ioctl(inode, &sk, &buf_size, uargs->buf); + ret = search_ioctl(root, &sk, &buf_size, uargs->buf); /* * In the origin implementation an overflow is handled by returning a @@ -1703,7 +1701,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode, return ret; } -static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, +static noinline int btrfs_ioctl_tree_search_v2(struct btrfs_root *root, void __user *argp) { struct btrfs_ioctl_search_args_v2 __user *uarg = argp; @@ -1725,7 +1723,7 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, if (buf_size > buf_limit) buf_size = buf_limit; - ret = search_ioctl(inode, &args.key, &buf_size, + ret = search_ioctl(root, &args.key, &buf_size, (char __user *)(&uarg->buf[0])); if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) ret = -EFAULT; @@ -1833,7 +1831,6 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, struct btrfs_path *path; struct btrfs_key key, key2; struct extent_buffer *leaf; - struct inode *temp_inode; char *ptr; int slot; int len; @@ -1861,6 +1858,8 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; while (1) { + struct btrfs_inode *temp_inode; + ret = btrfs_search_backwards(root, &key, path); if (ret < 0) goto out_put; @@ -1915,9 +1914,9 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, ret = PTR_ERR(temp_inode); goto out_put; } - ret = inode_permission(idmap, temp_inode, + ret = inode_permission(idmap, &temp_inode->vfs_inode, MAY_READ | MAY_EXEC); - iput(temp_inode); + iput(&temp_inode->vfs_inode); if (ret) { ret = -EACCES; goto out_put; @@ -2571,7 +2570,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) /* the rest are all set to zero by kzalloc */ range.len = (u64)-1; } - ret = btrfs_defrag_file(file_inode(file), &file->f_ra, + ret = btrfs_defrag_file(BTRFS_I(file_inode(file)), &file->f_ra, &range, BTRFS_OLDEST_GENERATION, 0); if (ret > 0) ret = 0; @@ -2763,7 +2762,7 @@ out_free: return ret; } -static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, +static long btrfs_ioctl_fs_info(const struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_fs_info_args *fi_args; @@ -2817,7 +2816,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, return ret; } -static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, +static long btrfs_ioctl_dev_info(const struct btrfs_fs_info *fs_info, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); @@ -4248,7 +4247,7 @@ static int btrfs_ioctl_get_features(struct btrfs_fs_info *fs_info, return 0; } -static int check_feature_bits(struct btrfs_fs_info *fs_info, +static int check_feature_bits(const struct btrfs_fs_info *fs_info, enum btrfs_feature_set set, u64 change_mask, u64 flags, u64 supported_flags, u64 safe_set, u64 safe_clear) @@ -4384,7 +4383,7 @@ out_drop_write: return ret; } -static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool compat) +static int _btrfs_ioctl_send(struct btrfs_root *root, void 
__user *argp, bool compat) { struct btrfs_ioctl_send_args *arg; int ret; @@ -4415,7 +4414,7 @@ static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool if (IS_ERR(arg)) return PTR_ERR(arg); } - ret = btrfs_ioctl_send(inode, arg); + ret = btrfs_ioctl_send(root, arg); kfree(arg); return ret; } @@ -5242,7 +5241,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SNAP_DESTROY_V2: return btrfs_ioctl_snap_destroy(file, argp, true); case BTRFS_IOC_SUBVOL_GETFLAGS: - return btrfs_ioctl_subvol_getflags(inode, argp); + return btrfs_ioctl_subvol_getflags(BTRFS_I(inode), argp); case BTRFS_IOC_SUBVOL_SETFLAGS: return btrfs_ioctl_subvol_setflags(file, argp); case BTRFS_IOC_DEFAULT_SUBVOL: @@ -5264,9 +5263,9 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_DEV_INFO: return btrfs_ioctl_dev_info(fs_info, argp); case BTRFS_IOC_TREE_SEARCH: - return btrfs_ioctl_tree_search(inode, argp); + return btrfs_ioctl_tree_search(root, argp); case BTRFS_IOC_TREE_SEARCH_V2: - return btrfs_ioctl_tree_search_v2(inode, argp); + return btrfs_ioctl_tree_search_v2(root, argp); case BTRFS_IOC_INO_LOOKUP: return btrfs_ioctl_ino_lookup(root, argp); case BTRFS_IOC_INO_PATHS: @@ -5314,10 +5313,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_set_received_subvol_32(file, argp); #endif case BTRFS_IOC_SEND: - return _btrfs_ioctl_send(BTRFS_I(inode), argp, false); + return _btrfs_ioctl_send(root, argp, false); #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) case BTRFS_IOC_SEND_32: - return _btrfs_ioctl_send(BTRFS_I(inode), argp, true); + return _btrfs_ioctl_send(root, argp, true); #endif case BTRFS_IOC_GET_DEV_STATS: return btrfs_ioctl_get_dev_stats(fs_info, argp); diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index ce915fcda43b..e08ea446cf48 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -9,6 +9,8 @@ struct file; struct dentry; struct mnt_idmap; struct fileattr; +struct io_uring_cmd; +struct btrfs_inode; struct btrfs_fs_info; struct btrfs_ioctl_balance_args; @@ -18,7 +20,7 @@ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int btrfs_ioctl_get_supported_features(void __user *arg); -void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); +void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 9a7a7b723305..81e62b652e21 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -9,7 +9,6 @@ #include <linux/page-flags.h> #include <asm/bug.h> #include <trace/events/btrfs.h> -#include "misc.h" #include "ctree.h" #include "extent_io.h" #include "locking.h" diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 30eceaf829a7..03c945711003 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -842,10 +842,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, /* * Start IO and wait for a given ordered extent to finish. * - * Wait on page writeback for all the pages in the extent and the IO completion - * code to insert metadata into the btree corresponding to the extent. 
+ * Wait on page writeback for all the pages in the extent but not in + * [@nowriteback_start, @nowriteback_start + @nowriteback_len) and the + * IO completion code to insert metadata into the btree corresponding to the extent. */ -void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) +void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry, + u64 nowriteback_start, u32 nowriteback_len) { u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; @@ -865,8 +867,19 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) * start IO on any dirty ones so the wait doesn't stall waiting * for the flusher thread to find them */ - if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) - filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); + if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) { + if (!nowriteback_len) { + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); + } else { + if (start < nowriteback_start) + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, + nowriteback_start - 1); + if (nowriteback_start + nowriteback_len < end) + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, + nowriteback_start + nowriteback_len, + end); + } + } if (!freespace_inode) btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); @@ -1229,6 +1242,18 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( */ if (WARN_ON_ONCE(len >= ordered->num_bytes)) return ERR_PTR(-EINVAL); + /* + * If our ordered extent had an error there's no point in continuing. + * The error may have come from a transaction abort done either by this + * task or some other concurrent task, and the transaction abort path + * iterates over all existing ordered extents and sets the flag + * BTRFS_ORDERED_IOERR on them. + */ + if (unlikely(flags & (1U << BTRFS_ORDERED_IOERR))) { + const int fs_error = BTRFS_FS_ERROR(fs_info); + + return fs_error ? ERR_PTR(fs_error) : ERR_PTR(-EIO); + } /* We cannot split partially completed ordered extents. 
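/*
 * The range arithmetic used by btrfs_start_ordered_extent_nowriteback()
 * above, as a self-contained sketch: write back the inclusive range
 * [start, end] minus the half-open window [nw_start, nw_start + nw_len).
 * wb() is a hypothetical stand-in for filemap_fdatawrite_range().
 */
typedef unsigned long long u64;

static void writeback_except(u64 start, u64 end, u64 nw_start, u64 nw_len,
			     void (*wb)(u64 first, u64 last))
{
	if (nw_len == 0) {
		wb(start, end);
		return;
	}
	/* Flush whatever lies before the excluded window... */
	if (start < nw_start)
		wb(start, nw_start - 1);
	/* ...and whatever lies after it, if anything remains. */
	if (nw_start + nw_len < end)
		wb(nw_start + nw_len, end);
}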
*/ if (ordered->bytes_left) { ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS)); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 4e152736d06c..1e6b0b182b29 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -17,6 +17,7 @@ struct inode; struct page; struct extent_state; +struct btrfs_block_group; struct btrfs_inode; struct btrfs_root; struct btrfs_fs_info; @@ -191,7 +192,13 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset); -void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry); +void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry, + u64 nowriteback_start, u32 nowriteback_len); +static inline void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) +{ + return btrfs_start_ordered_extent_nowriteback(entry, 0, 0); +} + int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index 8504bf1702c7..d0e620bf5f5a 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -6,6 +6,8 @@ #ifndef BTRFS_PRINT_TREE_H #define BTRFS_PRINT_TREE_H +#include <linux/types.h> + /* Buffer size to contain tree name and possibly additional data (offset) */ #define BTRFS_ROOT_NAME_BUF_LEN 48 diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index b8fa34e16abb..adc956432d2f 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -26,8 +26,8 @@ struct prop_handler { const char *xattr_name; int (*validate)(const struct btrfs_inode *inode, const char *value, size_t len); - int (*apply)(struct inode *inode, const char *value, size_t len); - const char *(*extract)(const struct inode *inode); + int (*apply)(struct btrfs_inode *inode, const char *value, size_t len); + const char *(*extract)(const struct btrfs_inode *inode); bool (*ignore)(const struct btrfs_inode *inode); int inheritable; }; @@ -121,7 +121,7 @@ int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, if (ret) return ret; - ret = handler->apply(&inode->vfs_inode, NULL, 0); + ret = handler->apply(inode, NULL, 0); ASSERT(ret == 0); return ret; @@ -131,7 +131,7 @@ int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, value_len, flags); if (ret) return ret; - ret = handler->apply(&inode->vfs_inode, value, value_len); + ret = handler->apply(inode, value, value_len); if (ret) { btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name, NULL, 0, flags); @@ -263,7 +263,7 @@ static void inode_prop_iterator(void *ctx, struct btrfs_root *root = BTRFS_I(inode)->root; int ret; - ret = handler->apply(inode, value, len); + ret = handler->apply(BTRFS_I(inode), value, len); if (unlikely(ret)) btrfs_warn(root->fs_info, "error applying prop %s to ino %llu (root %llu): %d", @@ -273,12 +273,13 @@ static void inode_prop_iterator(void *ctx, set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); } -int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path) +int btrfs_load_inode_props(struct btrfs_inode *inode, struct btrfs_path *path) { - struct btrfs_root *root = BTRFS_I(inode)->root; - u64 ino = btrfs_ino(BTRFS_I(inode)); + struct btrfs_root *root = inode->root; + u64 ino = btrfs_ino(inode); - return iterate_object_props(root, path, ino, inode_prop_iterator, inode); + return 
iterate_object_props(root, path, ino, inode_prop_iterator, + &inode->vfs_inode); } static int prop_compression_validate(const struct btrfs_inode *inode, @@ -300,26 +301,26 @@ static int prop_compression_validate(const struct btrfs_inode *inode, return -EINVAL; } -static int prop_compression_apply(struct inode *inode, const char *value, +static int prop_compression_apply(struct btrfs_inode *inode, const char *value, size_t len) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; int type; /* Reset to defaults */ if (len == 0) { - BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; + inode->flags &= ~BTRFS_INODE_COMPRESS; + inode->flags &= ~BTRFS_INODE_NOCOMPRESS; + inode->prop_compress = BTRFS_COMPRESS_NONE; return 0; } /* Set NOCOMPRESS flag */ if ((len == 2 && strncmp("no", value, 2) == 0) || (len == 4 && strncmp("none", value, 4) == 0)) { - BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; + inode->flags |= BTRFS_INODE_NOCOMPRESS; + inode->flags &= ~BTRFS_INODE_COMPRESS; + inode->prop_compress = BTRFS_COMPRESS_NONE; return 0; } @@ -336,9 +337,9 @@ static int prop_compression_apply(struct inode *inode, const char *value, return -EINVAL; } - BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->prop_compress = type; + inode->flags &= ~BTRFS_INODE_NOCOMPRESS; + inode->flags |= BTRFS_INODE_COMPRESS; + inode->prop_compress = type; return 0; } @@ -359,13 +360,13 @@ static bool prop_compression_ignore(const struct btrfs_inode *inode) return false; } -static const char *prop_compression_extract(const struct inode *inode) +static const char *prop_compression_extract(const struct btrfs_inode *inode) { - switch (BTRFS_I(inode)->prop_compress) { + switch (inode->prop_compress) { case BTRFS_COMPRESS_ZLIB: case BTRFS_COMPRESS_LZO: case BTRFS_COMPRESS_ZSTD: - return btrfs_compress_type2str(BTRFS_I(inode)->prop_compress); + return btrfs_compress_type2str(inode->prop_compress); default: break; } @@ -385,16 +386,16 @@ static struct prop_handler prop_handlers[] = { }; int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, - struct inode *inode, const struct inode *parent) + struct btrfs_inode *inode, + const struct btrfs_inode *parent) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; int ret; int i; bool need_reserve = false; - if (!test_bit(BTRFS_INODE_HAS_PROPS, - &BTRFS_I(parent)->runtime_flags)) + if (!test_bit(BTRFS_INODE_HAS_PROPS, &parent->runtime_flags)) return 0; for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { @@ -405,7 +406,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, if (!h->inheritable) continue; - if (h->ignore(BTRFS_I(inode))) + if (h->ignore(inode)) continue; value = h->extract(parent); @@ -416,7 +417,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, * This is not strictly necessary as the property should be * valid, but in case it isn't, don't propagate it further. 
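/*
 * The inheritance loop above follows a persist-then-apply shape; a hedged
 * generic sketch (handler fields here are illustrative, not the btrfs
 * prop_handler API): write the xattr first, then update the in-memory
 * state, and remove the xattr again if the apply step fails, so disk and
 * memory never disagree about an inherited property.
 */
struct prop_model {
	int (*setxattr)(const char *value);	/* NULL value removes it */
	int (*apply)(const char *value);
};

static int inherit_prop_model(const struct prop_model *h, const char *value)
{
	int ret = h->setxattr(value);

	if (ret)
		return ret;
	ret = h->apply(value);
	if (ret)
		h->setxattr(NULL);	/* roll back the on-disk xattr */
	return ret;
}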
*/ - ret = h->validate(BTRFS_I(inode), value, strlen(value)); + ret = h->validate(inode, value, strlen(value)); if (ret) continue; @@ -436,16 +437,15 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, return ret; } - ret = btrfs_setxattr(trans, inode, h->xattr_name, value, + ret = btrfs_setxattr(trans, &inode->vfs_inode, h->xattr_name, value, strlen(value), 0); if (!ret) { ret = h->apply(inode, value, strlen(value)); if (ret) - btrfs_setxattr(trans, inode, h->xattr_name, + btrfs_setxattr(trans, &inode->vfs_inode, h->xattr_name, NULL, 0, 0); else - set_bit(BTRFS_INODE_HAS_PROPS, - &BTRFS_I(inode)->runtime_flags); + set_bit(BTRFS_INODE_HAS_PROPS, &inode->runtime_flags); } if (need_reserve) { diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index 63546d0a9444..15d9a025c923 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -6,9 +6,9 @@ #ifndef BTRFS_PROPS_H #define BTRFS_PROPS_H +#include <linux/types.h> #include <linux/compiler_types.h> -struct inode; struct btrfs_inode; struct btrfs_path; struct btrfs_trans_handle; @@ -22,10 +22,10 @@ int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name, const char *value, size_t value_len); bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name); -int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path); +int btrfs_load_inode_props(struct btrfs_inode *inode, struct btrfs_path *path); int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, - struct inode *inode, - const struct inode *dir); + struct btrfs_inode *inode, + const struct btrfs_inode *dir); #endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index b90fabe302e6..d6fa36674270 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -956,8 +956,8 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, return -ENOMEM; key.objectid = 0; - key.offset = 0; key.type = 0; + key.offset = 0; while (1) { ret = btrfs_search_slot(trans, root, &key, path, -1, 1); @@ -1880,11 +1880,7 @@ int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 su * Commit current transaction to make sure all the rfer/excl numbers * get updated. */ - trans = btrfs_start_transaction(fs_info->quota_root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - ret = btrfs_commit_transaction(trans); + ret = btrfs_commit_current_transaction(fs_info->quota_root); if (ret < 0) return ret; @@ -1897,8 +1893,11 @@ int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 su /* * It's squota and the subvolume still has numbers needed for future * accounting, in this case we can not delete it. Just skip it. + * + * Or the qgroup is already removed by a qgroup rescan. For both cases we're + * safe to ignore them. 
*/ - if (ret == -EBUSY) + if (ret == -EBUSY || ret == -ENOENT) ret = 0; return ret; } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index e233cc79af18..a979fd59a4da 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -22,6 +22,9 @@ struct btrfs_ioctl_quota_ctl_args; struct btrfs_trans_handle; struct btrfs_delayed_ref_root; struct btrfs_inode; +struct btrfs_transaction; +struct btrfs_block_group; +struct btrfs_qgroup_swapped_blocks; /* * Btrfs qgroup overview diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h index 541836421778..69942ad43140 100644 --- a/fs/btrfs/raid-stripe-tree.h +++ b/fs/btrfs/raid-stripe-tree.h @@ -9,6 +9,7 @@ #include <linux/types.h> #include <uapi/linux/btrfs_tree.h> #include "fs.h" +#include "accessors.h" #define BTRFS_RST_SUPP_BLOCK_GROUP_MASK (BTRFS_BLOCK_GROUP_DUP | \ BTRFS_BLOCK_GROUP_RAID1_MASK | \ diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index f0824c948cb7..15c296cb4dac 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -165,7 +165,7 @@ out: * the source inode to destination inode when possible. When not possible we * copy the inline extent's data into the respective page of the inode. */ -static int clone_copy_inline_extent(struct inode *dst, +static int clone_copy_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_key *new_key, const u64 drop_start, @@ -175,8 +175,8 @@ static int clone_copy_inline_extent(struct inode *dst, char *inline_data, struct btrfs_trans_handle **trans_out) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(dst); - struct btrfs_root *root = BTRFS_I(dst)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; const u64 aligned_end = ALIGN(new_key->offset + datal, fs_info->sectorsize); struct btrfs_trans_handle *trans = NULL; @@ -185,12 +185,12 @@ static int clone_copy_inline_extent(struct inode *dst, struct btrfs_key key; if (new_key->offset > 0) { - ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + ret = copy_inline_to_page(inode, new_key->offset, inline_data, size, datal, comp_type); goto out; } - key.objectid = btrfs_ino(BTRFS_I(dst)); + key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -205,7 +205,7 @@ static int clone_copy_inline_extent(struct inode *dst, goto copy_inline_extent; } btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == btrfs_ino(BTRFS_I(dst)) && + if (key.objectid == btrfs_ino(inode) && key.type == BTRFS_EXTENT_DATA_KEY) { /* * There's an implicit hole at file offset 0, copy the @@ -214,7 +214,7 @@ static int clone_copy_inline_extent(struct inode *dst, ASSERT(key.offset > 0); goto copy_to_page; } - } else if (i_size_read(dst) <= datal) { + } else if (i_size_read(&inode->vfs_inode) <= datal) { struct btrfs_file_extent_item *ei; ei = btrfs_item_ptr(path->nodes[0], path->slots[0], @@ -236,7 +236,7 @@ copy_inline_extent: * We have no extent items, or we have an extent at offset 0 which may * or may not be inlined. All these cases are dealt the same way. 
*/ - if (i_size_read(dst) > datal) { + if (i_size_read(&inode->vfs_inode) > datal) { /* * At the destination offset 0 we have either a hole, a regular * extent or an inline extent larger then the one we want to @@ -270,7 +270,7 @@ copy_inline_extent: drop_args.start = drop_start; drop_args.end = aligned_end; drop_args.drop_cache = true; - ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args); + ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) goto out; ret = btrfs_insert_empty_item(trans, root, path, new_key, size); @@ -281,9 +281,9 @@ copy_inline_extent: btrfs_item_ptr_offset(path->nodes[0], path->slots[0]), size); - btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found); - btrfs_set_inode_full_sync(BTRFS_I(dst)); - ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end); + btrfs_update_inode_bytes(inode, datal, drop_args.bytes_found); + btrfs_set_inode_full_sync(inode); + ret = btrfs_inode_set_file_extent_range(inode, 0, aligned_end); out: if (!ret && !trans) { /* @@ -318,7 +318,7 @@ copy_to_page: */ btrfs_release_path(path); - ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + ret = copy_inline_to_page(inode, new_key->offset, inline_data, size, datal, comp_type); goto out; } @@ -526,7 +526,7 @@ process_slot: goto out; } - ret = clone_copy_inline_extent(inode, path, &new_key, + ret = clone_copy_inline_extent(BTRFS_I(inode), path, &new_key, drop_start, datal, size, comp, buf, &trans); if (ret) @@ -617,26 +617,26 @@ out: return ret; } -static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2) +static void btrfs_double_mmap_lock(struct btrfs_inode *inode1, struct btrfs_inode *inode2) { if (inode1 < inode2) swap(inode1, inode2); - down_write(&BTRFS_I(inode1)->i_mmap_lock); - down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING); + down_write(&inode1->i_mmap_lock); + down_write_nested(&inode2->i_mmap_lock, SINGLE_DEPTH_NESTING); } -static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2) +static void btrfs_double_mmap_unlock(struct btrfs_inode *inode1, struct btrfs_inode *inode2) { - up_write(&BTRFS_I(inode1)->i_mmap_lock); - up_write(&BTRFS_I(inode2)->i_mmap_lock); + up_write(&inode1->i_mmap_lock); + up_write(&inode2->i_mmap_lock); } -static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, - struct inode *dst, u64 dst_loff) +static int btrfs_extent_same_range(struct btrfs_inode *src, u64 loff, u64 len, + struct btrfs_inode *dst, u64 dst_loff) { const u64 end = dst_loff + len - 1; struct extent_state *cached_state = NULL; - struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info; + struct btrfs_fs_info *fs_info = src->root->fs_info; const u64 bs = fs_info->sectorsize; int ret; @@ -646,9 +646,10 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, * because we have already locked the inode's i_mmap_lock in exclusive * mode. 
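/*
 * The address-ordering trick behind btrfs_double_mmap_lock() above, as a
 * self-contained pthreads model: always acquire the lower-addressed lock
 * first, so two tasks locking the same pair of inodes can never deadlock
 * ABBA style. (In the kernel, down_write_nested(SINGLE_DEPTH_NESTING)
 * additionally tells lockdep that nesting two locks of one class is
 * intentional; pthreads has no equivalent annotation.)
 */
#include <pthread.h>

static void double_wrlock_model(pthread_rwlock_t *a, pthread_rwlock_t *b)
{
	if (a > b) {
		pthread_rwlock_t *tmp = a;

		a = b;
		b = tmp;
	}
	pthread_rwlock_wrlock(a);
	if (b != a)
		pthread_rwlock_wrlock(b);
}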
*/ - lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state); - ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); - unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state); + lock_extent(&dst->io_tree, dst_loff, end, &cached_state); + ret = btrfs_clone(&src->vfs_inode, &dst->vfs_inode, loff, len, + ALIGN(len, bs), dst_loff, 1); + unlock_extent(&dst->io_tree, dst_loff, end, &cached_state); btrfs_btree_balance_dirty(fs_info); @@ -678,8 +679,8 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); for (i = 0; i < chunk_count; i++) { - ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, - dst, dst_loff); + ret = btrfs_extent_same_range(BTRFS_I(src), loff, BTRFS_MAX_DEDUPE_LEN, + BTRFS_I(dst), dst_loff); if (ret) goto out; @@ -688,7 +689,8 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, } if (tail_len > 0) - ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff); + ret = btrfs_extent_same_range(BTRFS_I(src), loff, tail_len, + BTRFS_I(dst), dst_loff); out: spin_lock(&root_dst->root_item_lock); root_dst->dedupe_in_progress--; @@ -775,24 +777,24 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags) { - struct inode *inode_in = file_inode(file_in); - struct inode *inode_out = file_inode(file_out); - u64 bs = BTRFS_I(inode_out)->root->fs_info->sectorsize; + struct btrfs_inode *inode_in = BTRFS_I(file_inode(file_in)); + struct btrfs_inode *inode_out = BTRFS_I(file_inode(file_out)); + u64 bs = inode_out->root->fs_info->sectorsize; u64 wb_len; int ret; if (!(remap_flags & REMAP_FILE_DEDUP)) { - struct btrfs_root *root_out = BTRFS_I(inode_out)->root; + struct btrfs_root *root_out = inode_out->root; if (btrfs_root_readonly(root_out)) return -EROFS; - ASSERT(inode_in->i_sb == inode_out->i_sb); + ASSERT(inode_in->vfs_inode.i_sb == inode_out->vfs_inode.i_sb); } /* Don't make the dst file partly checksummed */ - if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) { + if ((inode_in->flags & BTRFS_INODE_NODATASUM) != + (inode_out->flags & BTRFS_INODE_NODATASUM)) { return -EINVAL; } @@ -811,7 +813,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, * to complete so that new file extent items are in the fs tree. */ if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP)) - wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs); + wb_len = ALIGN(inode_in->vfs_inode.i_size, bs) - ALIGN_DOWN(pos_in, bs); else wb_len = ALIGN(*len, bs); @@ -832,16 +834,14 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, * Also we don't need to check ASYNC_EXTENT, as async extent will be * CoWed anyway, not affecting nocow part. 
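/*
 * The block-size rounding used throughout the remap prep above, assuming
 * the usual power-of-two definitions behind the kernel's ALIGN helpers:
 * round the start down and the length up so the flushed and waited-on
 * window covers every block the remap can touch.
 */
#define MODEL_ALIGN_DOWN(x, a)	((x) & ~((unsigned long long)(a) - 1))
#define MODEL_ALIGN_UP(x, a)	MODEL_ALIGN_DOWN((x) + (a) - 1, (a))

/*
 * Example: pos_in = 5000, len = 3000, bs = 4096
 *   start  = MODEL_ALIGN_DOWN(5000, 4096) = 4096
 *   wb_len = MODEL_ALIGN_UP(3000, 4096)   = 4096
 */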
*/ - ret = filemap_flush(inode_in->i_mapping); + ret = filemap_flush(inode_in->vfs_inode.i_mapping); if (ret < 0) return ret; - ret = btrfs_wait_ordered_range(BTRFS_I(inode_in), ALIGN_DOWN(pos_in, bs), - wb_len); + ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), wb_len); if (ret < 0) return ret; - ret = btrfs_wait_ordered_range(BTRFS_I(inode_out), ALIGN_DOWN(pos_out, bs), - wb_len); + ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), wb_len); if (ret < 0) return ret; @@ -863,8 +863,8 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, struct file *dst_file, loff_t destoff, loff_t len, unsigned int remap_flags) { - struct inode *src_inode = file_inode(src_file); - struct inode *dst_inode = file_inode(dst_file); + struct btrfs_inode *src_inode = BTRFS_I(file_inode(src_file)); + struct btrfs_inode *dst_inode = BTRFS_I(file_inode(dst_file)); bool same_inode = dst_inode == src_inode; int ret; @@ -872,9 +872,9 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, return -EINVAL; if (same_inode) { - btrfs_inode_lock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); + btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP); } else { - lock_two_nondirectories(src_inode, dst_inode); + lock_two_nondirectories(&src_inode->vfs_inode, &dst_inode->vfs_inode); btrfs_double_mmap_lock(src_inode, dst_inode); } @@ -884,16 +884,18 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, goto out_unlock; if (remap_flags & REMAP_FILE_DEDUP) - ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff); + ret = btrfs_extent_same(&src_inode->vfs_inode, off, len, + &dst_inode->vfs_inode, destoff); else ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); out_unlock: if (same_inode) { - btrfs_inode_unlock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP); } else { btrfs_double_mmap_unlock(src_inode, dst_inode); - unlock_two_nondirectories(src_inode, dst_inode); + unlock_two_nondirectories(&src_inode->vfs_inode, + &dst_inode->vfs_inode); } /* diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index af0969b70b53..f948f4f6431c 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3239,21 +3239,23 @@ out: return ret; } -static int delete_block_group_cache(struct btrfs_fs_info *fs_info, - struct btrfs_block_group *block_group, +static int delete_block_group_cache(struct btrfs_block_group *block_group, struct inode *inode, u64 ino) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *root = fs_info->tree_root; struct btrfs_trans_handle *trans; + struct btrfs_inode *btrfs_inode; int ret = 0; if (inode) goto truncate; - inode = btrfs_iget(ino, root); - if (IS_ERR(inode)) + btrfs_inode = btrfs_iget(ino, root); + if (IS_ERR(btrfs_inode)) return -ENOENT; + inode = &btrfs_inode->vfs_inode; truncate: ret = btrfs_check_trunc_cache_free_space(fs_info, @@ -3313,8 +3315,7 @@ static int delete_v1_space_cache(struct extent_buffer *leaf, } if (!found) return -ENOENT; - ret = delete_block_group_cache(leaf->fs_info, block_group, NULL, - space_cache_ino); + ret = delete_block_group_cache(block_group, NULL, space_cache_ino); return ret; } @@ -3761,10 +3762,10 @@ out: * the inode is in data relocation tree and its link count is 0 */ static noinline_for_stack struct inode *create_reloc_inode( - struct btrfs_fs_info *fs_info, const struct btrfs_block_group *group) { - struct inode *inode = NULL; + struct btrfs_fs_info *fs_info = group->fs_info; + struct btrfs_inode *inode = NULL; struct 
btrfs_trans_handle *trans; struct btrfs_root *root; u64 objectid; @@ -3792,18 +3793,19 @@ static noinline_for_stack struct inode *create_reloc_inode( inode = NULL; goto out; } - BTRFS_I(inode)->reloc_block_group_start = group->start; + inode->reloc_block_group_start = group->start; - ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + ret = btrfs_orphan_add(trans, inode); out: btrfs_put_root(root); btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); if (ret) { - iput(inode); + if (inode) + iput(&inode->vfs_inode); inode = ERR_PTR(ret); } - return inode; + return &inode->vfs_inode; } /* @@ -3977,7 +3979,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) btrfs_free_path(path); if (!IS_ERR(inode)) - ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0); + ret = delete_block_group_cache(rc->block_group, inode, 0); else ret = PTR_ERR(inode); @@ -3986,7 +3988,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) goto out; } - rc->data_inode = create_reloc_inode(fs_info, rc->block_group); + rc->data_inode = create_reloc_inode(rc->block_group); if (IS_ERR(rc->data_inode)) { err = PTR_ERR(rc->data_inode); rc->data_inode = NULL; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 531312efee8d..2c5edcee9450 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1380,11 +1380,11 @@ static int find_first_extent_item(struct btrfs_root *extent_root, if (path->nodes[0]) goto search_forward; + key.objectid = search_start; if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; - key.objectid = search_start; key.offset = (u64)-1; ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); @@ -2497,8 +2497,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, path->skip_locking = 1; key.objectid = scrub_dev->devid; - key.offset = 0ull; key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = 0ull; while (1) { u64 dev_extent_len; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f437138fefbc..0c8c58c4f29b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -16,7 +16,6 @@ #include <linux/compat.h> #include <linux/crc32c.h> #include <linux/fsverity.h> - #include "send.h" #include "ctree.h" #include "backref.h" @@ -178,6 +177,7 @@ struct send_ctx { u64 cur_inode_rdev; u64 cur_inode_last_extent; u64 cur_inode_next_write_offset; + struct fs_path cur_inode_path; bool cur_inode_new; bool cur_inode_new_gen; bool cur_inode_deleted; @@ -425,15 +425,21 @@ static int need_send_hole(struct send_ctx *sctx) static void fs_path_reset(struct fs_path *p) { - if (p->reversed) { + if (p->reversed) p->start = p->buf + p->buf_len - 1; - p->end = p->start; - *p->start = 0; - } else { + else p->start = p->buf; - p->end = p->start; - *p->start = 0; - } + + p->end = p->start; + *p->start = 0; +} + +static void init_path(struct fs_path *p) +{ + p->reversed = 0; + p->buf = p->inline_buf; + p->buf_len = FS_PATH_INLINE_SIZE; + fs_path_reset(p); } static struct fs_path *fs_path_alloc(void) @@ -443,10 +449,7 @@ static struct fs_path *fs_path_alloc(void) p = kmalloc(sizeof(*p), GFP_KERNEL); if (!p) return NULL; - p->reversed = 0; - p->buf = p->inline_buf; - p->buf_len = FS_PATH_INLINE_SIZE; - fs_path_reset(p); + init_path(p); return p; } @@ -471,7 +474,7 @@ static void fs_path_free(struct fs_path *p) kfree(p); } -static int fs_path_len(struct fs_path *p) +static inline int fs_path_len(const struct fs_path *p) { return p->end - p->start; } @@ -487,12 +490,10 @@ static int 
fs_path_ensure_buf(struct fs_path *p, int len) if (p->buf_len >= len) return 0; - if (len > PATH_MAX) { - WARN_ON(1); - return -ENOMEM; - } + if (WARN_ON(len > PATH_MAX)) + return -ENAMETOOLONG; - path_len = p->end - p->start; + path_len = fs_path_len(p); old_buf_len = p->buf_len; /* @@ -533,12 +534,12 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len, int ret; int new_len; - new_len = p->end - p->start + name_len; + new_len = fs_path_len(p) + name_len; if (p->start != p->end) new_len++; ret = fs_path_ensure_buf(p, new_len); if (ret < 0) - goto out; + return ret; if (p->reversed) { if (p->start != p->end) @@ -553,8 +554,7 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len, *p->end = 0; } -out: - return ret; + return 0; } static int fs_path_add(struct fs_path *p, const char *name, int name_len) @@ -564,25 +564,15 @@ static int fs_path_add(struct fs_path *p, const char *name, int name_len) ret = fs_path_prepare_for_add(p, name_len, &prepared); if (ret < 0) - goto out; + return ret; memcpy(prepared, name, name_len); -out: - return ret; + return 0; } -static int fs_path_add_path(struct fs_path *p, struct fs_path *p2) +static inline int fs_path_add_path(struct fs_path *p, const struct fs_path *p2) { - int ret; - char *prepared; - - ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared); - if (ret < 0) - goto out; - memcpy(prepared, p2->start, p2->end - p2->start); - -out: - return ret; + return fs_path_add(p, p2->start, fs_path_len(p2)); } static int fs_path_add_from_extent_buffer(struct fs_path *p, @@ -594,12 +584,11 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p, ret = fs_path_prepare_for_add(p, len, &prepared); if (ret < 0) - goto out; + return ret; read_extent_buffer(eb, prepared, off, len); -out: - return ret; + return 0; } static int fs_path_copy(struct fs_path *p, struct fs_path *from) @@ -619,13 +608,21 @@ static void fs_path_unreverse(struct fs_path *p) return; tmp = p->start; - len = p->end - p->start; + len = fs_path_len(p); p->start = p->buf; p->end = p->start + len; memmove(p->start, tmp, len + 1); p->reversed = 0; } +static inline bool is_current_inode_path(const struct send_ctx *sctx, + const struct fs_path *path) +{ + const struct fs_path *cur = &sctx->cur_inode_path; + + return (strncmp(path->start, cur->start, fs_path_len(cur)) == 0); +} + static struct btrfs_path *alloc_path_for_send(void) { struct btrfs_path *path; @@ -740,7 +737,7 @@ static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr, #define TLV_PUT_PATH(sctx, attrtype, p) \ do { \ ret = tlv_put_string(sctx, attrtype, p->start, \ - p->end - p->start); \ + fs_path_len((p))); \ if (ret < 0) \ goto tlv_put_failure; \ } while(0) @@ -826,7 +823,7 @@ static int send_rename(struct send_ctx *sctx, ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from); TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to); @@ -834,7 +831,6 @@ static int send_rename(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -851,7 +847,7 @@ static int send_link(struct send_ctx *sctx, ret = begin_cmd(sctx, BTRFS_SEND_C_LINK); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk); @@ -859,7 +855,6 @@ static int send_link(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -875,14 +870,13 @@ static int send_unlink(struct send_ctx *sctx, struct fs_path *path) ret = 
begin_cmd(sctx, BTRFS_SEND_C_UNLINK); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -898,14 +892,13 @@ static int send_rmdir(struct send_ctx *sctx, struct fs_path *path) ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -1897,7 +1890,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, ret = get_inode_info(sctx->send_root, ino, &info); if (ret < 0 && ret != -ENOENT) - goto out; + return ret; left_ret = (info.nlink == 0) ? -ENOENT : ret; left_gen = info.gen; if (send_gen) @@ -1908,7 +1901,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, } else { ret = get_inode_info(sctx->parent_root, ino, &info); if (ret < 0 && ret != -ENOENT) - goto out; + return ret; right_ret = (info.nlink == 0) ? -ENOENT : ret; right_gen = info.gen; if (parent_gen) @@ -1953,7 +1946,6 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, ret = -ENOENT; } -out: return ret; } @@ -1967,17 +1959,14 @@ static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen, ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen); if (ret < 0) - goto out; + return ret; if (ret == inode_state_no_change || ret == inode_state_did_create || ret == inode_state_will_delete) - ret = 1; - else - ret = 0; + return 1; -out: - return ret; + return 0; } /* @@ -2326,9 +2315,8 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, *parent_gen = nce->parent_gen; ret = fs_path_add(dest, nce->name, nce->name_len); if (ret < 0) - goto out; - ret = nce->ret; - goto out; + return ret; + return nce->ret; } } @@ -2339,12 +2327,12 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, */ ret = is_inode_existent(sctx, ino, gen, NULL, NULL); if (ret < 0) - goto out; + return ret; if (!ret) { ret = gen_unique_name(sctx, ino, gen, dest); if (ret < 0) - goto out; + return ret; ret = 1; goto out_cache; } @@ -2360,21 +2348,21 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, ret = get_first_ref(sctx->parent_root, ino, parent_ino, parent_gen, dest); if (ret < 0) - goto out; + return ret; /* * Check if the ref was overwritten by an inode's ref that was processed * earlier. If yes, treat as orphan and return 1. */ ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen, - dest->start, dest->end - dest->start); + dest->start, fs_path_len(dest)); if (ret < 0) - goto out; + return ret; if (ret) { fs_path_reset(dest); ret = gen_unique_name(sctx, ino, gen, dest); if (ret < 0) - goto out; + return ret; ret = 1; } @@ -2383,10 +2371,8 @@ out_cache: * Store the result of the lookup in the name cache. 
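/*
 * Illustration: the caching step described above amounts to a single
 * allocation that carries the resolved name inline. A minimal sketch with
 * simplified fields (the real name_cache_entry embeds a
 * struct btrfs_lru_cache_entry keyed on inode number and generation):
 */
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/types.h>

struct name_cache_sketch {
	u64 ino;
	u64 gen;
	u64 parent_ino;
	u64 parent_gen;
	int ret;	/* cached result of the original lookup */
	int name_len;
	char name[];	/* name bytes stored right after the header */
};

static struct name_cache_sketch *name_cache_alloc(u64 ino, u64 gen,
						  const char *name, int name_len)
{
	struct name_cache_sketch *nce;

	/* One kmalloc() covers both the header and the trailing name. */
	nce = kmalloc(sizeof(*nce) + name_len, GFP_KERNEL);
	if (!nce)
		return NULL;
	nce->ino = ino;
	nce->gen = gen;
	nce->name_len = name_len;
	memcpy(nce->name, name, name_len);
	return nce;
}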
*/ nce = kmalloc(sizeof(*nce) + fs_path_len(dest), GFP_KERNEL); - if (!nce) { - ret = -ENOMEM; - goto out; - } + if (!nce) + return -ENOMEM; nce->entry.key = ino; nce->entry.gen = gen; @@ -2404,10 +2390,9 @@ out_cache: nce_ret = btrfs_lru_cache_store(&sctx->name_cache, &nce->entry, GFP_KERNEL); if (nce_ret < 0) { kfree(nce); - ret = nce_ret; + return nce_ret; } -out: return ret; } @@ -2444,6 +2429,14 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, u64 parent_inode = 0; u64 parent_gen = 0; int stop = 0; + const bool is_cur_inode = (ino == sctx->cur_ino && gen == sctx->cur_inode_gen); + + if (is_cur_inode && fs_path_len(&sctx->cur_inode_path) > 0) { + if (dest != &sctx->cur_inode_path) + return fs_path_copy(dest, &sctx->cur_inode_path); + + return 0; + } name = fs_path_alloc(); if (!name) { @@ -2495,8 +2488,12 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, out: fs_path_free(name); - if (!ret) + if (!ret) { fs_path_unreverse(dest); + if (is_cur_inode && dest != &sctx->cur_inode_path) + ret = fs_path_copy(&sctx->cur_inode_path, dest); + } + return ret; } @@ -2591,6 +2588,47 @@ out: return ret; } +static struct fs_path *get_cur_inode_path(struct send_ctx *sctx) +{ + if (fs_path_len(&sctx->cur_inode_path) == 0) { + int ret; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, + &sctx->cur_inode_path); + if (ret < 0) + return ERR_PTR(ret); + } + + return &sctx->cur_inode_path; +} + +static struct fs_path *get_path_for_command(struct send_ctx *sctx, u64 ino, u64 gen) +{ + struct fs_path *path; + int ret; + + if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen) + return get_cur_inode_path(sctx); + + path = fs_path_alloc(); + if (!path) + return ERR_PTR(-ENOMEM); + + ret = get_cur_path(sctx, ino, gen, path); + if (ret < 0) { + fs_path_free(path); + return ERR_PTR(ret); + } + + return path; +} + +static void free_path_for_command(const struct send_ctx *sctx, struct fs_path *path) +{ + if (path != &sctx->cur_inode_path) + fs_path_free(path); +} + static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) { struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; @@ -2599,17 +2637,14 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) btrfs_debug(fs_info, "send_truncate %llu size=%llu", ino, size); - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE); if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size); @@ -2617,7 +2652,7 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, p); return ret; } @@ -2629,17 +2664,14 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode) btrfs_debug(fs_info, "send_chmod %llu mode=%llu", ino, mode); - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD); if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777); @@ -2647,7 +2679,7 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, 
p); return ret; } @@ -2662,17 +2694,14 @@ static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr) btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr); - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR); if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr); @@ -2680,7 +2709,7 @@ static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, p); return ret; } @@ -2693,17 +2722,14 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) btrfs_debug(fs_info, "send_chown %llu uid=%llu, gid=%llu", ino, uid, gid); - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN); if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid); TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid); @@ -2712,7 +2738,7 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, p); return ret; } @@ -2729,9 +2755,9 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) btrfs_debug(fs_info, "send_utimes %llu", ino); - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); path = alloc_path_for_send(); if (!path) { @@ -2756,9 +2782,6 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime); TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime); @@ -2770,7 +2793,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, p); btrfs_free_path(path); return ret; } @@ -3106,6 +3129,11 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen, goto out; ret = send_rename(sctx, path, orphan); + if (ret < 0) + goto out; + + if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen) + ret = fs_path_copy(&sctx->cur_inode_path, orphan); out: fs_path_free(orphan); @@ -4158,6 +4186,23 @@ out: return ret; } +static int rename_current_inode(struct send_ctx *sctx, + struct fs_path *current_path, + struct fs_path *new_path) +{ + int ret; + + ret = send_rename(sctx, current_path, new_path); + if (ret < 0) + return ret; + + ret = fs_path_copy(&sctx->cur_inode_path, new_path); + if (ret < 0) + return ret; + + return fs_path_copy(current_path, new_path); +} + /* * This does all the move/link/unlink/rmdir magic. 
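/*
 * Illustration: the calling convention behind the conversions above.
 * get_path_for_command() hands back either the shared sctx->cur_inode_path
 * (when the target is the inode currently being processed) or a freshly
 * allocated path; free_path_for_command() frees only the latter. A sketch of
 * the shape every converted send command now has (the command type is just an
 * example):
 */
static int send_command_sketch(struct send_ctx *sctx, u64 ino, u64 gen)
{
	struct fs_path *p;
	int ret;

	p = get_path_for_command(sctx, ino, gen);	/* borrowed or owned */
	if (IS_ERR(p))
		return PTR_ERR(p);

	ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE);
	if (ret < 0)
		goto out;
	/* ... TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p) and friends ... */
	ret = send_cmd(sctx);
out:
	free_path_for_command(sctx, p);	/* no-op for the cached path */
	return ret;
}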
*/ @@ -4172,9 +4217,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) u64 ow_inode = 0; u64 ow_gen; u64 ow_mode; - int did_overwrite = 0; - int is_orphan = 0; u64 last_dir_ino_rm = 0; + bool did_overwrite = false; + bool is_orphan = false; bool can_rename = true; bool orphanized_dir = false; bool orphanized_ancestor = false; @@ -4216,14 +4261,14 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (ret < 0) goto out; if (ret) - did_overwrite = 1; + did_overwrite = true; } if (sctx->cur_inode_new || did_overwrite) { ret = gen_unique_name(sctx, sctx->cur_ino, sctx->cur_inode_gen, valid_path); if (ret < 0) goto out; - is_orphan = 1; + is_orphan = true; } else { ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, valid_path); @@ -4348,6 +4393,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (ret > 0) { orphanized_ancestor = true; fs_path_reset(valid_path); + fs_path_reset(&sctx->cur_inode_path); ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, valid_path); @@ -4443,13 +4489,10 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * it depending on the inode mode. */ if (is_orphan && can_rename) { - ret = send_rename(sctx, valid_path, cur->full_path); - if (ret < 0) - goto out; - is_orphan = 0; - ret = fs_path_copy(valid_path, cur->full_path); + ret = rename_current_inode(sctx, valid_path, cur->full_path); if (ret < 0) goto out; + is_orphan = false; } else if (can_rename) { if (S_ISDIR(sctx->cur_inode_mode)) { /* @@ -4457,10 +4500,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * dirs, we always have one new and one deleted * ref. The deleted ref is ignored later. */ - ret = send_rename(sctx, valid_path, - cur->full_path); - if (!ret) - ret = fs_path_copy(valid_path, + ret = rename_current_inode(sctx, valid_path, cur->full_path); if (ret < 0) goto out; @@ -4507,7 +4547,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) sctx->cur_inode_gen, valid_path); if (ret < 0) goto out; - is_orphan = 1; + is_orphan = true; } list_for_each_entry(cur, &sctx->deleted_refs, list) { @@ -4553,6 +4593,8 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) ret = send_unlink(sctx, cur->full_path); if (ret < 0) goto out; + if (is_current_inode_path(sctx, cur->full_path)) + fs_path_reset(&sctx->cur_inode_path); } ret = dup_ref(cur, &check_dirs); if (ret < 0) @@ -4701,7 +4743,7 @@ out: static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) { - int ret = 0; + int ret; struct send_ctx *sctx = ctx; struct rb_node *node = NULL; struct recorded_ref data; @@ -4710,7 +4752,7 @@ static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) ret = get_inode_gen(sctx->send_root, dir, &dir_gen); if (ret < 0) - goto out; + return ret; data.dir = dir; data.dir_gen = dir_gen; @@ -4724,13 +4766,13 @@ static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) &sctx->new_refs, name, dir, dir_gen, sctx); } -out: + return ret; } static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) { - int ret = 0; + int ret; struct send_ctx *sctx = ctx; struct rb_node *node = NULL; struct recorded_ref data; @@ -4739,7 +4781,7 @@ static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx ret = get_inode_gen(sctx->parent_root, dir, &dir_gen); if (ret < 0) - goto out; + return ret; data.dir = dir; data.dir_gen = dir_gen; @@ -4753,7 
+4795,7 @@ static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx &sctx->deleted_refs, name, dir, dir_gen, sctx); } -out: + return ret; } @@ -4764,11 +4806,9 @@ static int record_new_ref(struct send_ctx *sctx) ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key, 0, record_new_ref_if_needed, sctx); if (ret < 0) - goto out; - ret = 0; + return ret; -out: - return ret; + return 0; } static int record_deleted_ref(struct send_ctx *sctx) @@ -4779,29 +4819,25 @@ static int record_deleted_ref(struct send_ctx *sctx) sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx); if (ret < 0) - goto out; - ret = 0; + return ret; -out: - return ret; + return 0; } static int record_changed_ref(struct send_ctx *sctx) { - int ret = 0; + int ret; ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key, 0, record_new_ref_if_needed, sctx); if (ret < 0) - goto out; + return ret; ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx); if (ret < 0) - goto out; - ret = 0; + return ret; -out: - return ret; + return 0; } /* @@ -4869,15 +4905,19 @@ out: } static int send_set_xattr(struct send_ctx *sctx, - struct fs_path *path, const char *name, int name_len, const char *data, int data_len) { - int ret = 0; + struct fs_path *path; + int ret; + + path = get_cur_inode_path(sctx); + if (IS_ERR(path)) + return PTR_ERR(path); ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len); @@ -4886,7 +4926,6 @@ static int send_set_xattr(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -4894,11 +4933,11 @@ static int send_remove_xattr(struct send_ctx *sctx, struct fs_path *path, const char *name, int name_len) { - int ret = 0; + int ret; ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len); @@ -4906,7 +4945,6 @@ static int send_remove_xattr(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -4914,19 +4952,13 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, const char *name, int name_len, const char *data, int data_len, void *ctx) { - int ret; struct send_ctx *sctx = ctx; - struct fs_path *p; struct posix_acl_xattr_header dummy_acl; /* Capabilities are emitted by finish_inode_if_needed */ if (!strncmp(name, XATTR_NAME_CAPS, name_len)) return 0; - p = fs_path_alloc(); - if (!p) - return -ENOMEM; - /* * This hack is needed because empty acls are stored as zero byte * data in xattrs. 
Problem with that is, that receiving these zero byte @@ -4943,48 +4975,27 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, } } - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto out; - - ret = send_set_xattr(sctx, p, name, name_len, data, data_len); - -out: - fs_path_free(p); - return ret; + return send_set_xattr(sctx, name, name_len, data, data_len); } static int __process_deleted_xattr(int num, struct btrfs_key *di_key, const char *name, int name_len, const char *data, int data_len, void *ctx) { - int ret; struct send_ctx *sctx = ctx; struct fs_path *p; - p = fs_path_alloc(); - if (!p) - return -ENOMEM; - - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto out; - - ret = send_remove_xattr(sctx, p, name, name_len); + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) + return PTR_ERR(p); -out: - fs_path_free(p); - return ret; + return send_remove_xattr(sctx, p, name, name_len); } static int process_new_xattr(struct send_ctx *sctx) { - int ret = 0; - - ret = iterate_dir_item(sctx->send_root, sctx->left_path, - __process_new_xattr, sctx); - - return ret; + return iterate_dir_item(sctx->send_root, sctx->left_path, + __process_new_xattr, sctx); } static int process_deleted_xattr(struct send_ctx *sctx) @@ -5100,17 +5111,15 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key, static int process_changed_xattr(struct send_ctx *sctx) { - int ret = 0; + int ret; ret = iterate_dir_item(sctx->send_root, sctx->left_path, __process_changed_new_xattr, sctx); if (ret < 0) - goto out; - ret = iterate_dir_item(sctx->parent_root, sctx->right_path, - __process_changed_deleted_xattr, sctx); + return ret; -out: - return ret; + return iterate_dir_item(sctx->parent_root, sctx->right_path, + __process_changed_deleted_xattr, sctx); } static int process_all_new_xattrs(struct send_ctx *sctx) @@ -5157,7 +5166,7 @@ static int send_verity(struct send_ctx *sctx, struct fs_path *path, ret = begin_cmd(sctx, BTRFS_SEND_C_ENABLE_VERITY); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM, @@ -5172,21 +5181,20 @@ static int send_verity(struct send_ctx *sctx, struct fs_path *path, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } static int process_verity(struct send_ctx *sctx) { int ret = 0; - struct inode *inode; + struct btrfs_inode *inode; struct fs_path *p; inode = btrfs_iget(sctx->cur_ino, sctx->send_root); if (IS_ERR(inode)) return PTR_ERR(inode); - ret = btrfs_get_verity_descriptor(inode, NULL, 0); + ret = btrfs_get_verity_descriptor(&inode->vfs_inode, NULL, 0); if (ret < 0) goto iput; @@ -5203,27 +5211,19 @@ static int process_verity(struct send_ctx *sctx) } } - ret = btrfs_get_verity_descriptor(inode, sctx->verity_descriptor, ret); + ret = btrfs_get_verity_descriptor(&inode->vfs_inode, sctx->verity_descriptor, ret); if (ret < 0) goto iput; - p = fs_path_alloc(); - if (!p) { - ret = -ENOMEM; + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) { + ret = PTR_ERR(p); goto iput; } - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto free_path; ret = send_verity(sctx, p, sctx->verity_descriptor); - if (ret < 0) - goto free_path; - -free_path: - fs_path_free(p); iput: - iput(inode); + iput(&inode->vfs_inode); return ret; } @@ -5343,31 +5343,25 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) int ret = 0; struct fs_path *p; - p = fs_path_alloc(); - if (!p) - return 
-ENOMEM; - btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len); - ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); - if (ret < 0) - goto out; + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) + return PTR_ERR(p); - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); ret = put_file_data(sctx, offset, len); if (ret < 0) - goto out; + return ret; ret = send_cmd(sctx); tlv_put_failure: -out: - fs_path_free(p); return ret; } @@ -5380,6 +5374,7 @@ static int send_clone(struct send_ctx *sctx, { int ret = 0; struct fs_path *p; + struct fs_path *cur_inode_path; u64 gen; btrfs_debug(sctx->send_root->fs_info, @@ -5387,6 +5382,10 @@ static int send_clone(struct send_ctx *sctx, offset, len, btrfs_root_id(clone_root->root), clone_root->ino, clone_root->offset); + cur_inode_path = get_cur_inode_path(sctx); + if (IS_ERR(cur_inode_path)) + return PTR_ERR(cur_inode_path); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -5395,13 +5394,9 @@ static int send_clone(struct send_ctx *sctx, if (ret < 0) goto out; - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto out; - TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len); - TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, cur_inode_path); if (clone_root->root == sctx->send_root) { ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen); @@ -5452,17 +5447,13 @@ static int send_update_extent(struct send_ctx *sctx, int ret = 0; struct fs_path *p; - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT); if (ret < 0) - goto out; - - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); @@ -5471,8 +5462,6 @@ static int send_update_extent(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: - fs_path_free(p); return ret; } @@ -5501,12 +5490,10 @@ static int send_hole(struct send_ctx *sctx, u64 end) if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) return send_update_extent(sctx, offset, end - offset); - p = fs_path_alloc(); - if (!p) - return -ENOMEM; - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto tlv_put_failure; + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) + return PTR_ERR(p); + while (offset < end) { u64 len = min(end - offset, read_size); @@ -5527,7 +5514,6 @@ static int send_hole(struct send_ctx *sctx, u64 end) } sctx->cur_inode_next_write_offset = offset; tlv_put_failure: - fs_path_free(p); return ret; } @@ -5535,9 +5521,7 @@ static int send_encoded_inline_extent(struct send_ctx *sctx, struct btrfs_path *path, u64 offset, u64 len) { - struct btrfs_root *root = sctx->send_root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct inode *inode; + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; struct fs_path *fspath; struct extent_buffer *leaf = path->nodes[0]; struct btrfs_key key; @@ -5546,23 +5530,13 @@ static int send_encoded_inline_extent(struct send_ctx *sctx, size_t inline_size; int ret; - inode = btrfs_iget(sctx->cur_ino, root); - if (IS_ERR(inode)) - return PTR_ERR(inode); - - fspath = fs_path_alloc(); - if 
(!fspath) { - ret = -ENOMEM; - goto out; - } + fspath = get_cur_inode_path(sctx); + if (IS_ERR(fspath)) + return PTR_ERR(fspath); ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE); if (ret < 0) - goto out; - - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); - if (ret < 0) - goto out; + return ret; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -5578,12 +5552,12 @@ static int send_encoded_inline_extent(struct send_ctx *sctx, ret = btrfs_encoded_io_compression_from_extent(fs_info, btrfs_file_extent_compression(leaf, ei)); if (ret < 0) - goto out; + return ret; TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret); ret = put_data_header(sctx, inline_size); if (ret < 0) - goto out; + return ret; read_extent_buffer(leaf, sctx->send_buf + sctx->send_size, btrfs_file_extent_inline_start(ei), inline_size); sctx->send_size += inline_size; @@ -5591,9 +5565,6 @@ static int send_encoded_inline_extent(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: - fs_path_free(fspath); - iput(inode); return ret; } @@ -5602,7 +5573,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; - struct inode *inode; + struct btrfs_inode *inode; struct fs_path *fspath; struct extent_buffer *leaf = path->nodes[0]; struct btrfs_key key; @@ -5617,9 +5588,9 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, if (IS_ERR(inode)) return PTR_ERR(inode); - fspath = fs_path_alloc(); - if (!fspath) { - ret = -ENOMEM; + fspath = get_cur_inode_path(sctx); + if (IS_ERR(fspath)) { + ret = PTR_ERR(fspath); goto out; } @@ -5627,10 +5598,6 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, if (ret < 0) goto out; - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); - if (ret < 0) - goto out; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); @@ -5672,7 +5639,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, * Note that send_buf is a mapping of send_buf_pages, so this is really * reading into send_buf. 
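/*
 * Illustration: why a slice of sctx->send_buf_pages can be handed to the
 * encoded read in the call just below. send_buf is vmapped over
 * send_buf_pages, so byte offset N of the buffer lives in page
 * N >> PAGE_SHIFT. A minimal sketch of the arithmetic, assuming the offset
 * was page aligned beforehand, as the send code does for data_offset:
 */
#include <linux/mm.h>

static struct page **send_buf_pages_at(struct page **pages, size_t aligned_off)
{
	/* Page that backs byte 'aligned_off' of the vmapped buffer. */
	return pages + (aligned_off >> PAGE_SHIFT);
}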
*/ - ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), + ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, disk_num_bytes, sctx->send_buf_pages + (data_offset >> PAGE_SHIFT), @@ -5698,8 +5665,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, tlv_put_failure: out: - fs_path_free(fspath); - iput(inode); + iput(&inode->vfs_inode); return ret; } @@ -5741,15 +5707,14 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, } if (sctx->cur_inode == NULL) { + struct btrfs_inode *btrfs_inode; struct btrfs_root *root = sctx->send_root; - sctx->cur_inode = btrfs_iget(sctx->cur_ino, root); - if (IS_ERR(sctx->cur_inode)) { - int err = PTR_ERR(sctx->cur_inode); + btrfs_inode = btrfs_iget(sctx->cur_ino, root); + if (IS_ERR(btrfs_inode)) + return PTR_ERR(btrfs_inode); - sctx->cur_inode = NULL; - return err; - } + sctx->cur_inode = &btrfs_inode->vfs_inode; memset(&sctx->ra, 0, sizeof(struct file_ra_state)); file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping); @@ -5828,7 +5793,6 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, */ static int send_capabilities(struct send_ctx *sctx) { - struct fs_path *fspath = NULL; struct btrfs_path *path; struct btrfs_dir_item *di; struct extent_buffer *leaf; @@ -5854,25 +5818,19 @@ static int send_capabilities(struct send_ctx *sctx) leaf = path->nodes[0]; buf_len = btrfs_dir_data_len(leaf, di); - fspath = fs_path_alloc(); buf = kmalloc(buf_len, GFP_KERNEL); - if (!fspath || !buf) { + if (!buf) { ret = -ENOMEM; goto out; } - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); - if (ret < 0) - goto out; - data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di); read_extent_buffer(leaf, buf, data_ptr, buf_len); - ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS, + ret = send_set_xattr(sctx, XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), buf, buf_len); out: kfree(buf); - fs_path_free(fspath); btrfs_free_path(path); return ret; } @@ -6898,6 +6856,7 @@ static int changed_inode(struct send_ctx *sctx, sctx->cur_inode_last_extent = (u64)-1; sctx->cur_inode_next_write_offset = 0; sctx->ignore_cur_inode = false; + fs_path_reset(&sctx->cur_inode_path); /* * Set send_progress to current inode. 
This will tell all get_cur_xxx @@ -8107,10 +8066,9 @@ static void dedupe_in_progress_warn(const struct btrfs_root *root) btrfs_root_id(root), root->dedupe_in_progress); } -long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg) +long btrfs_ioctl_send(struct btrfs_root *send_root, const struct btrfs_ioctl_send_args *arg) { int ret = 0; - struct btrfs_root *send_root = inode->root; struct btrfs_fs_info *fs_info = send_root->fs_info; struct btrfs_root *clone_root; struct send_ctx *sctx = NULL; @@ -8173,6 +8131,7 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a goto out; } + init_path(&sctx->cur_inode_path); INIT_LIST_HEAD(&sctx->new_refs); INIT_LIST_HEAD(&sctx->deleted_refs); @@ -8449,6 +8408,9 @@ out: btrfs_lru_cache_clear(&sctx->dir_created_cache); btrfs_lru_cache_clear(&sctx->dir_utimes_cache); + if (sctx->cur_inode_path.buf != sctx->cur_inode_path.inline_buf) + kfree(sctx->cur_inode_path.buf); + kfree(sctx); } diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 9309886c5ea1..652bb28f63d4 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -11,7 +11,7 @@ #include <linux/sizes.h> #include <linux/align.h> -struct btrfs_inode; +struct btrfs_root; struct btrfs_ioctl_send_args; #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" @@ -182,6 +182,6 @@ enum { __BTRFS_SEND_A_MAX = 35, }; -long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg); +long btrfs_ioctl_send(struct btrfs_root *send_root, const struct btrfs_ioctl_send_args *arg); #endif diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index a341d087567a..ff089e3e4103 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include "linux/spinlock.h" +#include <linux/spinlock.h> #include <linux/minmax.h> #include "misc.h" #include "ctree.h" diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 722acf768396..11dbd7be6a3b 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -2,12 +2,11 @@ #include <linux/slab.h> #include "messages.h" -#include "ctree.h" #include "subpage.h" #include "btrfs_inode.h" /* - * Subpage (sectorsize < PAGE_SIZE) support overview: + * Subpage (block size < folio size) support overview: * * Limitations: * @@ -64,35 +63,14 @@ * This means a slightly higher tree locking latency. */ -#if PAGE_SIZE > SZ_4K -bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping) -{ - if (fs_info->sectorsize >= PAGE_SIZE) - return false; - - /* - * Only data pages (either through DIO or compression) can have no - * mapping. And if page->mapping->host is data inode, it's subpage. - * As we have ruled our sectorsize >= PAGE_SIZE case already. - */ - if (!mapping || !mapping->host || is_data_inode(BTRFS_I(mapping->host))) - return true; - - /* - * Now the only remaining case is metadata, which we only go subpage - * routine if nodesize < PAGE_SIZE. - */ - if (fs_info->nodesize < PAGE_SIZE) - return true; - return false; -} -#endif - int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, enum btrfs_subpage_type type) { struct btrfs_subpage *subpage; + /* For metadata we don't support large folio yet. */ + ASSERT(!folio_test_large(folio)); + /* * We have cases like a dummy extent buffer page, which is not mapped * and doesn't need to be locked. 
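/*
 * Illustration for the subpage rework in the hunks below: the
 * filesystem-wide fs_info->sectors_per_page constant is replaced by a
 * per-folio block count, since folios may now be larger than a page.
 * Assuming btrfs_blocks_per_folio() is essentially the helper sketched here,
 * each state (uptodate, dirty, writeback, ordered, checked, locked) owns a
 * region of blocks_per_folio bits inside subpage->bitmaps, with the regions
 * laid out back to back:
 */
static unsigned int blocks_per_folio_sketch(const struct btrfs_fs_info *fs_info,
					    struct folio *folio)
{
	return folio_size(folio) >> fs_info->sectorsize_bits;
}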
@@ -101,10 +79,14 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, ASSERT(folio_test_locked(folio)); /* Either not subpage, or the folio already has private attached. */ - if (!btrfs_is_subpage(fs_info, folio->mapping) || folio_test_private(folio)) + if (folio_test_private(folio)) + return 0; + if (type == BTRFS_SUBPAGE_METADATA && !btrfs_meta_is_subpage(fs_info)) + return 0; + if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio)) return 0; - subpage = btrfs_alloc_subpage(fs_info, type); + subpage = btrfs_alloc_subpage(fs_info, folio_size(folio), type); if (IS_ERR(subpage)) return PTR_ERR(subpage); @@ -112,12 +94,17 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, return 0; } -void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio) +void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, + enum btrfs_subpage_type type) { struct btrfs_subpage *subpage; /* Either not subpage, or the folio already has private attached. */ - if (!btrfs_is_subpage(fs_info, folio->mapping) || !folio_test_private(folio)) + if (!folio_test_private(folio)) + return; + if (type == BTRFS_SUBPAGE_METADATA && !btrfs_meta_is_subpage(fs_info)) + return; + if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio)) return; subpage = folio_detach_private(folio); @@ -126,15 +113,16 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *fol } struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, - enum btrfs_subpage_type type) + size_t fsize, enum btrfs_subpage_type type) { struct btrfs_subpage *ret; unsigned int real_size; - ASSERT(fs_info->sectorsize < PAGE_SIZE); + ASSERT(fs_info->sectorsize < fsize); real_size = struct_size(ret, bitmaps, - BITS_TO_LONGS(btrfs_bitmap_nr_max * fs_info->sectors_per_page)); + BITS_TO_LONGS(btrfs_bitmap_nr_max * + (fsize >> fs_info->sectorsize_bits))); ret = kzalloc(real_size, GFP_NOFS); if (!ret) return ERR_PTR(-ENOMEM); @@ -165,7 +153,7 @@ void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio * { struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_meta_is_subpage(fs_info)) return; ASSERT(folio_test_private(folio) && folio->mapping); @@ -179,7 +167,7 @@ void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio * { struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_meta_is_subpage(fs_info)) return; ASSERT(folio_test_private(folio) && folio->mapping); @@ -206,16 +194,18 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, */ if (folio->mapping) ASSERT(folio_pos(folio) <= start && - start + len <= folio_pos(folio) + PAGE_SIZE); + start + len <= folio_pos(folio) + folio_size(folio)); } #define subpage_calc_start_bit(fs_info, folio, name, start, len) \ ({ \ - unsigned int __start_bit; \ + unsigned int __start_bit; \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ \ btrfs_subpage_assert(fs_info, folio, start, len); \ __start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ - __start_bit += fs_info->sectors_per_page * btrfs_bitmap_nr_##name; \ + __start_bit += blocks_per_folio * btrfs_bitmap_nr_##name; \ __start_bit; \ }) @@ -233,7 +223,7 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) if (folio_pos(folio) >= orig_start + orig_len) *len = 0; else - *len = min_t(u64, folio_pos(folio) + PAGE_SIZE, + *len = 
min_t(u64, folio_pos(folio) + folio_size(folio), orig_start + orig_len) - *start; } @@ -296,7 +286,7 @@ void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, ASSERT(folio_test_locked(folio)); - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio)) { folio_unlock(folio); return; } @@ -323,13 +313,14 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, unsigned long bitmap) { struct btrfs_subpage *subpage = folio_get_private(folio); - const int start_bit = fs_info->sectors_per_page * btrfs_bitmap_nr_locked; + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); + const int start_bit = blocks_per_folio * btrfs_bitmap_nr_locked; unsigned long flags; bool last = false; int cleared = 0; int bit; - if (!btrfs_is_subpage(fs_info, folio->mapping)) { + if (!btrfs_is_subpage(fs_info, folio)) { folio_unlock(folio); return; } @@ -341,7 +332,7 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, } spin_lock_irqsave(&subpage->lock, flags); - for_each_set_bit(bit, &bitmap, fs_info->sectors_per_page) { + for_each_set_bit(bit, &bitmap, blocks_per_folio) { if (test_and_clear_bit(bit + start_bit, subpage->bitmaps)) cleared++; } @@ -352,15 +343,27 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, folio_unlock(folio); } -#define subpage_test_bitmap_all_set(fs_info, subpage, name) \ +#define subpage_test_bitmap_all_set(fs_info, folio, name) \ +({ \ + struct btrfs_subpage *subpage = folio_get_private(folio); \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ + \ bitmap_test_range_all_set(subpage->bitmaps, \ - fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \ - fs_info->sectors_per_page) + blocks_per_folio * btrfs_bitmap_nr_##name, \ + blocks_per_folio); \ +}) -#define subpage_test_bitmap_all_zero(fs_info, subpage, name) \ +#define subpage_test_bitmap_all_zero(fs_info, folio, name) \ +({ \ + struct btrfs_subpage *subpage = folio_get_private(folio); \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ + \ bitmap_test_range_all_zero(subpage->bitmaps, \ - fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \ - fs_info->sectors_per_page) + blocks_per_folio * btrfs_bitmap_nr_##name, \ + blocks_per_folio); \ +}) void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) @@ -372,7 +375,7 @@ void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate)) + if (subpage_test_bitmap_all_set(fs_info, folio, uptodate)) folio_mark_uptodate(folio); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -426,7 +429,7 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_zero(fs_info, subpage, dirty)) + if (subpage_test_bitmap_all_zero(fs_info, folio, dirty)) last = true; spin_unlock_irqrestore(&subpage->lock, flags); return last; @@ -467,7 +470,7 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if 
(subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) { + if (subpage_test_bitmap_all_zero(fs_info, folio, writeback)) { ASSERT(folio_test_writeback(folio)); folio_end_writeback(folio); } @@ -498,7 +501,7 @@ void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered)) + if (subpage_test_bitmap_all_zero(fs_info, folio, ordered)) folio_clear_ordered(folio); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -513,7 +516,7 @@ void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_set(fs_info, subpage, checked)) + if (subpage_test_bitmap_all_set(fs_info, folio, checked)) folio_set_checked(folio); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -569,7 +572,7 @@ void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) { \ + !btrfs_is_subpage(fs_info, folio)) { \ folio_set_func(folio); \ return; \ } \ @@ -579,7 +582,7 @@ void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) { \ + !btrfs_is_subpage(fs_info, folio)) { \ folio_clear_func(folio); \ return; \ } \ @@ -589,7 +592,7 @@ bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) \ + !btrfs_is_subpage(fs_info, folio)) \ return folio_test_func(folio); \ return btrfs_subpage_test_##name(fs_info, folio, start, len); \ } \ @@ -597,7 +600,7 @@ void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) { \ + !btrfs_is_subpage(fs_info, folio)) { \ folio_set_func(folio); \ return; \ } \ @@ -608,7 +611,7 @@ void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) { \ + !btrfs_is_subpage(fs_info, folio)) { \ folio_clear_func(folio); \ return; \ } \ @@ -619,10 +622,32 @@ bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) \ + !btrfs_is_subpage(fs_info, folio)) \ return folio_test_func(folio); \ btrfs_subpage_clamp_range(folio, &start, &len); \ return btrfs_subpage_test_##name(fs_info, folio, start, len); \ +} \ +void btrfs_meta_folio_set_##name(struct folio *folio, const struct extent_buffer *eb) \ +{ \ + if (!btrfs_meta_is_subpage(eb->fs_info)) { \ + folio_set_func(folio); \ + return; \ + } \ + btrfs_subpage_set_##name(eb->fs_info, folio, eb->start, eb->len); \ +} \ +void btrfs_meta_folio_clear_##name(struct folio *folio, const struct extent_buffer *eb) \ +{ \ + if (!btrfs_meta_is_subpage(eb->fs_info)) { \ + folio_clear_func(folio); \ + return; \ + } \ + btrfs_subpage_clear_##name(eb->fs_info, folio, eb->start, eb->len); \ +} \ +bool btrfs_meta_folio_test_##name(struct folio *folio, const struct 
extent_buffer *eb) \ +{ \ + if (!btrfs_meta_is_subpage(eb->fs_info)) \ + return folio_test_func(folio); \ + return btrfs_subpage_test_##name(eb->fs_info, folio, eb->start, eb->len); \ } IMPLEMENT_BTRFS_PAGE_OPS(uptodate, folio_mark_uptodate, folio_clear_uptodate, folio_test_uptodate); @@ -635,26 +660,29 @@ IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered, IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, folio_test_checked); -#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ +#define GET_SUBPAGE_BITMAP(fs_info, folio, name, dst) \ { \ - const int sectors_per_page = fs_info->sectors_per_page; \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ + const struct btrfs_subpage *subpage = folio_get_private(folio); \ \ - ASSERT(sectors_per_page < BITS_PER_LONG); \ + ASSERT(blocks_per_folio < BITS_PER_LONG); \ *dst = bitmap_read(subpage->bitmaps, \ - sectors_per_page * btrfs_bitmap_nr_##name, \ - sectors_per_page); \ + blocks_per_folio * btrfs_bitmap_nr_##name, \ + blocks_per_folio); \ } #define SUBPAGE_DUMP_BITMAP(fs_info, folio, name, start, len) \ { \ - const struct btrfs_subpage *subpage = folio_get_private(folio); \ unsigned long bitmap; \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ \ - GET_SUBPAGE_BITMAP(subpage, fs_info, name, &bitmap); \ + GET_SUBPAGE_BITMAP(fs_info, folio, name, &bitmap); \ btrfs_warn(fs_info, \ "dumpping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \ start, len, folio_pos(folio), \ - fs_info->sectors_per_page, &bitmap); \ + blocks_per_folio, &bitmap); \ } /* @@ -672,7 +700,7 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) return; - if (!btrfs_is_subpage(fs_info, folio->mapping)) { + if (!btrfs_is_subpage(fs_info, folio)) { ASSERT(!folio_test_dirty(folio)); return; } @@ -707,7 +735,7 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, int ret; ASSERT(folio_test_locked(folio)); - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio)) return; subpage = folio_get_private(folio); @@ -721,15 +749,37 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, } bitmap_set(subpage->bitmaps, start_bit, nbits); ret = atomic_add_return(nbits, &subpage->nr_locked); - ASSERT(ret <= fs_info->sectors_per_page); + ASSERT(ret <= btrfs_blocks_per_folio(fs_info, folio)); spin_unlock_irqrestore(&subpage->lock, flags); } +/* + * Clear the dirty flag for the folio. + * + * If the affected folio is no longer dirty, return true. Otherwise return false. 
+ */ +bool btrfs_meta_folio_clear_and_test_dirty(struct folio *folio, const struct extent_buffer *eb) +{ + bool last; + + if (!btrfs_meta_is_subpage(eb->fs_info)) { + folio_clear_dirty_for_io(folio); + return true; + } + + last = btrfs_subpage_clear_and_test_dirty(eb->fs_info, folio, eb->start, eb->len); + if (last) { + folio_clear_dirty_for_io(folio); + return true; + } + return false; +} + void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage; - const u32 sectors_per_page = fs_info->sectors_per_page; + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); unsigned long uptodate_bitmap; unsigned long dirty_bitmap; unsigned long writeback_bitmap; @@ -739,28 +789,28 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, unsigned long flags; ASSERT(folio_test_private(folio) && folio_get_private(folio)); - ASSERT(sectors_per_page > 1); + ASSERT(blocks_per_folio > 1); subpage = folio_get_private(folio); spin_lock_irqsave(&subpage->lock, flags); - GET_SUBPAGE_BITMAP(subpage, fs_info, uptodate, &uptodate_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, &dirty_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, writeback, &writeback_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, ordered, &ordered_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, checked, &checked_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &locked_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, uptodate, &uptodate_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, dirty, &dirty_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, writeback, &writeback_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, ordered, &ordered_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, checked, &checked_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, locked, &locked_bitmap); spin_unlock_irqrestore(&subpage->lock, flags); dump_page(folio_page(folio, 0), "btrfs subpage dump"); btrfs_warn(fs_info, "start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", start, len, folio_pos(folio), - sectors_per_page, &uptodate_bitmap, - sectors_per_page, &dirty_bitmap, - sectors_per_page, &locked_bitmap, - sectors_per_page, &writeback_bitmap, - sectors_per_page, &ordered_bitmap, - sectors_per_page, &checked_bitmap); + blocks_per_folio, &uptodate_bitmap, + blocks_per_folio, &dirty_bitmap, + blocks_per_folio, &locked_bitmap, + blocks_per_folio, &writeback_bitmap, + blocks_per_folio, &ordered_bitmap, + blocks_per_folio, &checked_bitmap); } void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, @@ -771,10 +821,10 @@ void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, unsigned long flags; ASSERT(folio_test_private(folio) && folio_get_private(folio)); - ASSERT(fs_info->sectors_per_page > 1); + ASSERT(btrfs_blocks_per_folio(fs_info, folio) > 1); subpage = folio_get_private(folio); spin_lock_irqsave(&subpage->lock, flags); - GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, ret_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, dirty, ret_bitmap); spin_unlock_irqrestore(&subpage->lock, flags); } diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 44fff1f4eac4..3042c5ea840a 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -6,10 +6,11 @@ #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/sizes.h> +#include "btrfs_inode.h" +#include "fs.h" struct address_space; struct folio; -struct btrfs_fs_info; /* * Extra info for subpapge 
bitmap. @@ -69,23 +70,49 @@ enum btrfs_subpage_type { BTRFS_SUBPAGE_DATA, }; -#if PAGE_SIZE > SZ_4K -bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping); +#if PAGE_SIZE > BTRFS_MIN_BLOCKSIZE +/* + * Subpage support for metadata is more complex, as we can have dummy extent + * buffers, where folios have no mapping to determine the owning inode. + * + * Thankfully we only need to check if node size is smaller than page size. + * Even with larger folio support, we will only allocate a folio as large as + * node size. + * Thus if nodesize < PAGE_SIZE, we know metadata needs to go through the subpage routine. + */ +static inline bool btrfs_meta_is_subpage(const struct btrfs_fs_info *fs_info) +{ + return fs_info->nodesize < PAGE_SIZE; +} +static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, + struct folio *folio) +{ + if (folio->mapping && folio->mapping->host) + ASSERT(is_data_inode(BTRFS_I(folio->mapping->host))); + return fs_info->sectorsize < folio_size(folio); +} #else +static inline bool btrfs_meta_is_subpage(const struct btrfs_fs_info *fs_info) +{ + return false; +} static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, - struct address_space *mapping) + struct folio *folio) { + if (folio->mapping && folio->mapping->host) + ASSERT(is_data_inode(BTRFS_I(folio->mapping->host))); return false; } #endif int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, enum btrfs_subpage_type type); -void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio); +void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, + enum btrfs_subpage_type type); /* Allocate additional data where page represents more than one sector */ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, - enum btrfs_subpage_type type); + size_t fsize, enum btrfs_subpage_type type); void btrfs_free_subpage(struct btrfs_subpage *subpage); void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); @@ -110,6 +137,13 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, * btrfs_folio_clamp_*() are similar to btrfs_folio_*(), except the range doesn't * need to be inside the page. Those functions will truncate the range * automatically. + * + * Both btrfs_folio_*() and btrfs_folio_clamp_*() are for data folios. + * + * For metadata, one should use btrfs_meta_folio_*() helpers instead, and there + * is no clamp version for metadata helpers, as we either go subpage + * (nodesize < PAGE_SIZE) or go regular folio helpers (nodesize >= PAGE_SIZE, + * and our folio is never larger than nodesize). 
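/*
 * Illustration: picking between the two helper families declared above. The
 * wrapper itself is hypothetical; only the two calls it forwards to come
 * from this header:
 */
static void set_dirty_sketch(const struct btrfs_fs_info *fs_info,
			     struct folio *folio,
			     const struct extent_buffer *eb,
			     u64 start, u32 len)
{
	if (eb)	/* metadata: range implied by the extent buffer */
		btrfs_meta_folio_set_dirty(folio, eb);
	else	/* data: explicit range, clamped to the folio as needed */
		btrfs_folio_clamp_set_dirty(fs_info, folio, start, len);
}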
*/ #define DECLARE_BTRFS_SUBPAGE_OPS(name) \ void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \ @@ -129,7 +163,10 @@ void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len); \ bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ - struct folio *folio, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); \ +void btrfs_meta_folio_set_##name(struct folio *folio, const struct extent_buffer *eb); \ +void btrfs_meta_folio_clear_##name(struct folio *folio, const struct extent_buffer *eb); \ +bool btrfs_meta_folio_test_##name(struct folio *folio, const struct extent_buffer *eb); DECLARE_BTRFS_SUBPAGE_OPS(uptodate); DECLARE_BTRFS_SUBPAGE_OPS(dirty); @@ -155,6 +192,7 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); +bool btrfs_meta_folio_clear_and_test_dirty(struct folio *folio, const struct extent_buffer *eb); void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, struct folio *folio, unsigned long *ret_bitmap); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index dc4fee519ca6..40709e2a44fc 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -84,7 +84,7 @@ struct btrfs_fs_context { u32 thread_pool_size; unsigned long long mount_opt; unsigned long compress_type:4; - unsigned int compress_level; + int compress_level; refcount_t refs; }; @@ -947,7 +947,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec static int btrfs_fill_super(struct super_block *sb, struct btrfs_fs_devices *fs_devices) { - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_fs_info *fs_info = btrfs_sb(sb); int err; @@ -982,7 +982,7 @@ static int btrfs_fill_super(struct super_block *sb, goto fail_close; } - sb->s_root = d_make_root(inode); + sb->s_root = d_make_root(&inode->vfs_inode); if (!sb->s_root) { err = -ENOMEM; goto fail_close; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 53b846d99ece..b9af74498b0c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -411,7 +411,8 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj, { ssize_t ret = 0; - /* An artificial limit to only support 4K and PAGE_SIZE */ + if (BTRFS_MIN_BLOCKSIZE != SZ_4K && BTRFS_MIN_BLOCKSIZE != PAGE_SIZE) + ret += sysfs_emit_at(buf, ret, "%u ", BTRFS_MIN_BLOCKSIZE); if (PAGE_SIZE > SZ_4K) ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K); ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE); @@ -1330,29 +1331,30 @@ MODULE_PARM_DESC(read_policy, int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) { - char param[32] = { 0 }; + char param[32]; char __maybe_unused *value_str; if (!str || strlen(str) == 0) return 0; - strncpy(param, str, sizeof(param) - 1); + strscpy(param, str); #ifdef CONFIG_BTRFS_EXPERIMENTAL /* Separate value from input in policy:value format. */ value_str = strchr(param, ':'); if (value_str) { - int ret; + char *retptr; *value_str = 0; value_str++; if (!value_ret) return -EINVAL; - ret = kstrtos64(value_str, 10, value_ret); - if (ret) + + *value_ret = memparse(value_str, &retptr); + /* There could be any trailing typos after the value. 
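/*
 * Illustration: what switching from kstrtos64() to memparse() buys. With
 * CONFIG_BTRFS_EXPERIMENTAL, the "policy:value" form now accepts size
 * suffixes, e.g. a hypothetical "round-robin:192k" written to
 * /sys/fs/btrfs/<UUID>/read_policy; memparse() consumes "192k" as 196608 and
 * leaves retptr at the first unparsed character. A sketch of the
 * accept/reject logic used above:
 */
#include <linux/kernel.h>
#include <linux/string.h>

static int parse_policy_value_sketch(const char *value_str, s64 *value_ret)
{
	char *retptr;

	*value_ret = memparse(value_str, &retptr);	/* "192k" -> 196608 */
	retptr = skip_spaces(retptr);			/* allow trailing spaces */
	if (*retptr != 0 || *value_ret <= 0)		/* reject anything else */
		return -EINVAL;
	return 0;
}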
*/ + retptr = skip_spaces(retptr); + if (*retptr != 0 || *value_ret <= 0) return -EINVAL; - if (*value_ret < 0) - return -ERANGE; } #endif diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 3fc5c6f90dc4..0f94ae923210 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -7,6 +7,7 @@ #include <linux/compiler_types.h> #include <linux/kobject.h> +struct block_device; struct btrfs_fs_info; struct btrfs_device; struct btrfs_fs_devices; diff --git a/fs/btrfs/tests/delayed-refs-tests.c b/fs/btrfs/tests/delayed-refs-tests.c index 6558508c2ddf..265370e79a54 100644 --- a/fs/btrfs/tests/delayed-refs-tests.c +++ b/fs/btrfs/tests/delayed-refs-tests.c @@ -1009,6 +1009,7 @@ int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize) if (!ret) ret = select_delayed_refs_test(&trans); + kfree(transaction); out_free_fs_info: btrfs_free_dummy_fs_info(fs_info); return ret; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 0a2dbfaaf49e..74aca7180a5a 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -525,7 +525,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) goto out; } - eb = __alloc_dummy_extent_buffer(fs_info, 0, nodesize); + eb = alloc_dummy_extent_buffer(fs_info, 0); if (!eb) { test_std_err(TEST_ALLOC_ROOT); ret = -ENOMEM; @@ -542,7 +542,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) * Test again for case where the tree block is sectorsize aligned but * not nodesize aligned. */ - eb = __alloc_dummy_extent_buffer(fs_info, sectorsize, nodesize); + eb = alloc_dummy_extent_buffer(fs_info, sectorsize); if (!eb) { test_std_err(TEST_ALLOC_ROOT); ret = -ENOMEM; @@ -730,7 +730,7 @@ static int test_eb_mem_ops(u32 sectorsize, u32 nodesize) goto out; } - eb = __alloc_dummy_extent_buffer(fs_info, SZ_1M, nodesize); + eb = alloc_dummy_extent_buffer(fs_info, SZ_1M); if (!eb) { test_std_err(TEST_ALLOC_EXTENT_BUFFER); ret = -ENOMEM; diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 56e61ac1cc64..609bb6c9c087 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -1045,6 +1045,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, ret = btrfs_add_chunk_map(fs_info, map); if (ret) { test_err("error adding chunk map to mapping tree"); + btrfs_free_chunk_map(map); goto out_free; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 15312013f2a3..f26a394a9ec5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -160,7 +160,13 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) cache = list_first_entry(&transaction->deleted_bgs, struct btrfs_block_group, bg_list); + /* + * Not strictly necessary to lock, as no other task will be using a + * block_group on the deleted_bgs list during a transaction abort. 
+ */ + spin_lock(&transaction->fs_info->unused_bgs_lock); list_del_init(&cache->bg_list); + spin_unlock(&transaction->fs_info->unused_bgs_lock); btrfs_unfreeze_block_group(cache); btrfs_put_block_group(cache); } @@ -274,8 +280,10 @@ loop: cur_trans = fs_info->running_transaction; if (cur_trans) { if (TRANS_ABORTED(cur_trans)) { + const int abort_error = cur_trans->aborted; + spin_unlock(&fs_info->trans_lock); - return cur_trans->aborted; + return abort_error; } if (btrfs_blocked_trans_types[cur_trans->state] & type) { spin_unlock(&fs_info->trans_lock); @@ -1633,7 +1641,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *root = pending->root; struct btrfs_root *parent_root; struct btrfs_block_rsv *rsv; - struct inode *parent_inode = &pending->dir->vfs_inode; + struct btrfs_inode *parent_inode = pending->dir; struct btrfs_path *path; struct btrfs_dir_item *dir_item; struct extent_buffer *tmp; @@ -1659,7 +1667,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * filesystem. */ nofs_flags = memalloc_nofs_save(); - pending->error = fscrypt_setup_filename(parent_inode, + pending->error = fscrypt_setup_filename(&parent_inode->vfs_inode, &pending->dentry->d_name, 0, &fname); memalloc_nofs_restore(nofs_flags); @@ -1688,8 +1696,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } key.objectid = objectid; - key.offset = (u64)-1; key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; rsv = trans->block_rsv; trans->block_rsv = &pending->block_rsv; @@ -1697,16 +1705,16 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, trace_btrfs_space_reservation(fs_info, "transaction", trans->transid, trans->bytes_reserved, 1); - parent_root = BTRFS_I(parent_inode)->root; + parent_root = parent_inode->root; ret = record_root_in_trans(trans, parent_root, 0); if (ret) goto fail; - cur_time = current_time(parent_inode); + cur_time = current_time(&parent_inode->vfs_inode); /* * insert the directory item */ - ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index); + ret = btrfs_set_inode_index(parent_inode, &index); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1714,7 +1722,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* check if there is a file/dir which has the same name. 
*/ dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, - btrfs_ino(BTRFS_I(parent_inode)), + btrfs_ino(parent_inode), &fname.disk_name, 0); if (dir_item != NULL && !IS_ERR(dir_item)) { pending->error = -EEXIST; @@ -1815,7 +1823,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, */ ret = btrfs_add_root_ref(trans, objectid, btrfs_root_id(parent_root), - btrfs_ino(BTRFS_I(parent_inode)), index, + btrfs_ino(parent_inode), index, &fname.disk_name); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1853,18 +1861,18 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; ret = btrfs_insert_dir_item(trans, &fname.disk_name, - BTRFS_I(parent_inode), &key, BTRFS_FT_DIR, + parent_inode, &key, BTRFS_FT_DIR, index); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } - btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + + btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + fname.disk_name.len * 2); - inode_set_mtime_to_ts(parent_inode, - inode_set_ctime_current(parent_inode)); - ret = btrfs_update_inode_fallback(trans, BTRFS_I(parent_inode)); + inode_set_mtime_to_ts(&parent_inode->vfs_inode, + inode_set_ctime_current(&parent_inode->vfs_inode)); + ret = btrfs_update_inode_fallback(trans, parent_inode); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -2094,7 +2102,14 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); + /* + * Not strictly necessary to lock, as no other task will be using a + * block_group on the new_bgs list during a transaction abort. + */ + spin_lock(&fs_info->unused_bgs_lock); list_del_init(&block_group->bg_list); + btrfs_put_block_group(block_group); + spin_unlock(&fs_info->unused_bgs_lock); } } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 955d1677e865..90dc094cfa5e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -138,10 +138,10 @@ static void wait_log_commit(struct btrfs_root *root, int transid); * and once to do all the other items. */ -static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root) +static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root) { unsigned int nofs_flag; - struct inode *inode; + struct btrfs_inode *inode; /* * We're holding a transaction handle whether we are logging or @@ -376,12 +376,12 @@ static int process_one_buffer(struct btrfs_root *log, } /* - * Item overwrite used by replay and tree logging. eb, slot and key all refer - * to the src data we are copying out. + * Item overwrite used by log replay. The given eb, slot and key all refer to + * the source data we are copying out. * - * root is the tree we are copying into, and path is a scratch - * path for use in this function (it should be released on entry and - * will be released on exit). + * The given root is for the tree we are copying into, and path is a scratch + * path for use in this function (it should be released on entry and will be + * released on exit). * * If the key is already in the destination tree the existing item is * overwritten. If the existing item isn't big enough, it is extended. 
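The overwrite_item() hunks that follow drop one of the two temporary buffers: instead of copying both the source and the destination items into kmalloc'ed memory and comparing them with memcmp(), only the source item is copied and then checked in place against the destination leaf with memcmp_extent_buffer(). A minimal sketch of the resulting pattern, assuming btrfs's internal accessors from accessors.h; the helper name log_item_matches and the trimmed error handling are illustrative, not part of the patch:

/*
 * Sketch only: decide whether a log item already matches the item in the
 * destination leaf.  Returns 1 if equal, 0 if not, -ENOMEM on allocation
 * failure.  dst_eb/dst_slot come from a prior btrfs_search_slot().
 */
static int log_item_matches(struct extent_buffer *src_eb, unsigned long src_ptr,
			    struct extent_buffer *dst_eb, int dst_slot,
			    u32 item_size)
{
	unsigned long dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
	char *src_copy;
	int ret;

	/* Different sizes can never match, no need to copy anything. */
	if (btrfs_item_size(dst_eb, dst_slot) != item_size)
		return 0;

	src_copy = kmalloc(item_size, GFP_NOFS);
	if (!src_copy)
		return -ENOMEM;

	/* One temporary copy of the source item ... */
	read_extent_buffer(src_eb, src_copy, src_ptr, item_size);
	/* ... compared in place against the destination leaf. */
	ret = memcmp_extent_buffer(dst_eb, src_copy, dst_ptr, item_size);
	kfree(src_copy);

	return ret == 0;
}

This saves one allocation and one extent buffer copy per replayed item on the common path where the items already match.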
@@ -401,6 +401,8 @@ static int overwrite_item(struct btrfs_trans_handle *trans, int save_old_i_size = 0; unsigned long src_ptr; unsigned long dst_ptr; + struct extent_buffer *dst_eb; + int dst_slot; bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; /* @@ -420,11 +422,13 @@ static int overwrite_item(struct btrfs_trans_handle *trans, if (ret < 0) return ret; + dst_eb = path->nodes[0]; + dst_slot = path->slots[0]; + if (ret == 0) { char *src_copy; - char *dst_copy; - u32 dst_size = btrfs_item_size(path->nodes[0], - path->slots[0]); + const u32 dst_size = btrfs_item_size(dst_eb, dst_slot); + if (dst_size != item_size) goto insert; @@ -432,23 +436,16 @@ static int overwrite_item(struct btrfs_trans_handle *trans, btrfs_release_path(path); return 0; } - dst_copy = kmalloc(item_size, GFP_NOFS); src_copy = kmalloc(item_size, GFP_NOFS); - if (!dst_copy || !src_copy) { + if (!src_copy) { btrfs_release_path(path); - kfree(dst_copy); - kfree(src_copy); return -ENOMEM; } read_extent_buffer(eb, src_copy, src_ptr, item_size); + dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot); + ret = memcmp_extent_buffer(dst_eb, src_copy, dst_ptr, item_size); - dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); - read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, - item_size); - ret = memcmp(dst_copy, src_copy, item_size); - - kfree(dst_copy); kfree(src_copy); /* * they have the same contents, just return, this saves @@ -470,9 +467,9 @@ static int overwrite_item(struct btrfs_trans_handle *trans, u64 nbytes; u32 mode; - item = btrfs_item_ptr(path->nodes[0], path->slots[0], + item = btrfs_item_ptr(dst_eb, dst_slot, struct btrfs_inode_item); - nbytes = btrfs_inode_nbytes(path->nodes[0], item); + nbytes = btrfs_inode_nbytes(dst_eb, item); item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); btrfs_set_inode_nbytes(eb, item, nbytes); @@ -514,11 +511,13 @@ insert: key, item_size); path->skip_release_on_error = 0; + dst_eb = path->nodes[0]; + dst_slot = path->slots[0]; + /* make sure any existing item is the correct size */ if (ret == -EEXIST || ret == -EOVERFLOW) { - u32 found_size; - found_size = btrfs_item_size(path->nodes[0], - path->slots[0]); + const u32 found_size = btrfs_item_size(dst_eb, dst_slot); + if (found_size > item_size) btrfs_truncate_item(trans, path, item_size, 1); else if (found_size < item_size) @@ -526,8 +525,7 @@ insert: } else if (ret) { return ret; } - dst_ptr = btrfs_item_ptr_offset(path->nodes[0], - path->slots[0]); + dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot); /* don't overwrite an existing inode if the generation number * was logged as zero. 
This is done when the tree logging code @@ -546,7 +544,6 @@ insert: dst_item = (struct btrfs_inode_item *)dst_ptr; if (btrfs_inode_generation(eb, src_item) == 0) { - struct extent_buffer *dst_eb = path->nodes[0]; const u64 ino_size = btrfs_inode_size(eb, src_item); /* @@ -564,30 +561,28 @@ insert: } if (S_ISDIR(btrfs_inode_mode(eb, src_item)) && - S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { + S_ISDIR(btrfs_inode_mode(dst_eb, dst_item))) { save_old_i_size = 1; - saved_i_size = btrfs_inode_size(path->nodes[0], - dst_item); + saved_i_size = btrfs_inode_size(dst_eb, dst_item); } } - copy_extent_buffer(path->nodes[0], eb, dst_ptr, - src_ptr, item_size); + copy_extent_buffer(dst_eb, eb, dst_ptr, src_ptr, item_size); if (save_old_i_size) { struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; - btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); + btrfs_set_inode_size(dst_eb, dst_item, saved_i_size); } /* make sure the generation is filled in */ if (key->type == BTRFS_INODE_ITEM_KEY) { struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; - if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { - btrfs_set_inode_generation(path->nodes[0], dst_item, - trans->transid); - } + if (btrfs_inode_generation(dst_eb, dst_item) == 0) + btrfs_set_inode_generation(dst_eb, dst_item, trans->transid); } no_copy: btrfs_release_path(path); @@ -613,14 +608,14 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, * simple helper to read an inode off the disk from a given root * This can only be called for subvolume roots and not for the log */ -static noinline struct inode *read_one_inode(struct btrfs_root *root, - u64 objectid) +static noinline struct btrfs_inode *read_one_inode(struct btrfs_root *root, + u64 objectid) { - struct inode *inode; + struct btrfs_inode *inode; inode = btrfs_iget_logging(objectid, root); if (IS_ERR(inode)) - inode = NULL; + return NULL; return inode; } @@ -649,7 +644,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, u64 start = key->offset; u64 nbytes = 0; struct btrfs_file_extent_item *item; - struct inode *inode = NULL; + struct btrfs_inode *inode = NULL; unsigned long size; int ret = 0; @@ -688,31 +683,23 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, * file. This must be done before the btrfs_drop_extents run * so we don't try to drop this extent. 
*/ - ret = btrfs_lookup_file_extent(trans, root, path, - btrfs_ino(BTRFS_I(inode)), start, 0); + ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), start, 0); if (ret == 0 && (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC)) { - struct btrfs_file_extent_item cmp1; - struct btrfs_file_extent_item cmp2; - struct btrfs_file_extent_item *existing; - struct extent_buffer *leaf; - - leaf = path->nodes[0]; - existing = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); + struct btrfs_file_extent_item existing; + unsigned long ptr; - read_extent_buffer(eb, &cmp1, (unsigned long)item, - sizeof(cmp1)); - read_extent_buffer(leaf, &cmp2, (unsigned long)existing, - sizeof(cmp2)); + ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + read_extent_buffer(path->nodes[0], &existing, ptr, sizeof(existing)); /* * we already have a pointer to this exact extent, * we don't have to do anything */ - if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { + if (memcmp_extent_buffer(eb, &existing, (unsigned long)item, + sizeof(existing)) == 0) { btrfs_release_path(path); goto out; } @@ -723,7 +710,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, drop_args.start = start; drop_args.end = extent_end; drop_args.drop_cache = true; - ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); + ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) goto out; @@ -747,8 +734,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, (unsigned long)item, sizeof(*item)); ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); - ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); ins.type = BTRFS_EXTENT_ITEM_KEY; + ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); offset = key->offset - btrfs_file_extent_offset(eb, item); /* @@ -901,16 +888,15 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, goto out; } - ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, - extent_end - start); + ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start); if (ret) goto out; update_inode: - btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found); - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found); + ret = btrfs_update_inode(trans, inode); out: - iput(inode); + iput(&inode->vfs_inode); return ret; } @@ -947,7 +933,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di) { struct btrfs_root *root = dir->root; - struct inode *inode; + struct btrfs_inode *inode; struct fscrypt_str name; struct extent_buffer *leaf; struct btrfs_key location; @@ -972,10 +958,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name); + ret = unlink_inode_for_log_replay(trans, dir, inode, &name); out: kfree(name.name); - iput(inode); + iput(&inode->vfs_inode); return ret; } @@ -1148,7 +1134,7 @@ again: u32 item_size; u32 cur_offset = 0; unsigned long base; - struct inode *victim_parent; + struct btrfs_inode *victim_parent; leaf = path->nodes[0]; @@ -1188,10 +1174,10 @@ again: btrfs_release_path(path); ret = unlink_inode_for_log_replay(trans, - BTRFS_I(victim_parent), + victim_parent, inode, &victim_name); } - iput(victim_parent); + iput(&victim_parent->vfs_inode); kfree(victim_name.name); if (ret) return ret; @@ -1325,7 +1311,7 @@ 
again: ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name); if (!ret) { - struct inode *dir; + struct btrfs_inode *dir; btrfs_release_path(path); dir = read_one_inode(root, parent_id); @@ -1334,10 +1320,9 @@ again: kfree(name.name); goto out; } - ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), - inode, &name); + ret = unlink_inode_for_log_replay(trans, dir, inode, &name); kfree(name.name); - iput(dir); + iput(&dir->vfs_inode); if (ret) goto out; goto again; @@ -1369,8 +1354,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - struct inode *dir = NULL; - struct inode *inode = NULL; + struct btrfs_inode *dir = NULL; + struct btrfs_inode *inode = NULL; unsigned long ref_ptr; unsigned long ref_end; struct fscrypt_str name = { 0 }; @@ -1435,8 +1420,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), - btrfs_ino(BTRFS_I(inode)), ref_index, &name); + ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), + ref_index, &name); if (ret < 0) { goto out; } else if (ret == 0) { @@ -1447,8 +1432,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * overwrite any existing back reference, and we don't * want to create dangling pointers in the directory. */ - ret = __add_inode_ref(trans, root, path, log, - BTRFS_I(dir), BTRFS_I(inode), + ret = __add_inode_ref(trans, root, path, log, dir, inode, inode_objectid, parent_objectid, ref_index, &name); if (ret) { @@ -1458,12 +1442,11 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, } /* insert our name */ - ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - &name, 0, ref_index); + ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index); if (ret) goto out; - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, inode); if (ret) goto out; } @@ -1473,7 +1456,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, kfree(name.name); name.name = NULL; if (log_ref_ver) { - iput(dir); + iput(&dir->vfs_inode); dir = NULL; } } @@ -1486,8 +1469,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * dir index entries exist for a name but there is no inode reference * item with the same name. */ - ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot, - key); + ret = unlink_old_inode_refs(trans, root, path, inode, eb, slot, key); if (ret) goto out; @@ -1496,8 +1478,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); kfree(name.name); - iput(dir); - iput(inode); + if (dir) + iput(&dir->vfs_inode); + if (inode) + iput(&inode->vfs_inode); return ret; } @@ -1611,25 +1595,25 @@ process_slot: * will free the inode. 
*/ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, - struct inode *inode) + struct btrfs_inode *inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct btrfs_path *path; int ret; u64 nlink = 0; - u64 ino = btrfs_ino(BTRFS_I(inode)); + const u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); if (!path) return -ENOMEM; - ret = count_inode_refs(BTRFS_I(inode), path); + ret = count_inode_refs(inode, path); if (ret < 0) goto out; nlink = ret; - ret = count_inode_extrefs(BTRFS_I(inode), path); + ret = count_inode_extrefs(inode, path); if (ret < 0) goto out; @@ -1637,17 +1621,17 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, ret = 0; - if (nlink != inode->i_nlink) { - set_nlink(inode, nlink); - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + if (nlink != inode->vfs_inode.i_nlink) { + set_nlink(&inode->vfs_inode, nlink); + ret = btrfs_update_inode(trans, inode); if (ret) goto out; } - if (S_ISDIR(inode->i_mode)) - BTRFS_I(inode)->index_cnt = (u64)-1; + if (S_ISDIR(inode->vfs_inode.i_mode)) + inode->index_cnt = (u64)-1; - if (inode->i_nlink == 0) { - if (S_ISDIR(inode->i_mode)) { + if (inode->vfs_inode.i_nlink == 0) { + if (S_ISDIR(inode->vfs_inode.i_mode)) { ret = replay_dir_deletes(trans, root, NULL, path, ino, 1); if (ret) @@ -1669,12 +1653,13 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, { int ret; struct btrfs_key key; - struct inode *inode; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = (u64)-1; while (1) { + struct btrfs_inode *inode; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) break; @@ -1703,7 +1688,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, } ret = fixup_inode_link_count(trans, inode); - iput(inode); + iput(&inode->vfs_inode); if (ret) break; @@ -1731,12 +1716,14 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, { struct btrfs_key key; int ret = 0; - struct inode *inode; + struct btrfs_inode *inode; + struct inode *vfs_inode; inode = read_one_inode(root, objectid); if (!inode) return -EIO; + vfs_inode = &inode->vfs_inode; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = objectid; @@ -1745,15 +1732,15 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, btrfs_release_path(path); if (ret == 0) { - if (!inode->i_nlink) - set_nlink(inode, 1); + if (!vfs_inode->i_nlink) + set_nlink(vfs_inode, 1); else - inc_nlink(inode); - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + inc_nlink(vfs_inode); + ret = btrfs_update_inode(trans, inode); } else if (ret == -EEXIST) { ret = 0; } - iput(inode); + iput(vfs_inode); return ret; } @@ -1769,8 +1756,8 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, const struct fscrypt_str *name, struct btrfs_key *location) { - struct inode *inode; - struct inode *dir; + struct btrfs_inode *inode; + struct btrfs_inode *dir; int ret; inode = read_one_inode(root, location->objectid); @@ -1779,17 +1766,16 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, dir = read_one_inode(root, dirid); if (!dir) { - iput(inode); + iput(&inode->vfs_inode); return -EIO; } - ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, - 1, index); + ret = btrfs_add_link(trans, dir, inode, name, 1, index); /* FIXME, put inode into FIXUP list */ - iput(inode); - iput(dir); + 
iput(&inode->vfs_inode); + iput(&dir->vfs_inode); return ret; } @@ -1851,7 +1837,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, bool index_dst_matches = false; struct btrfs_key log_key; struct btrfs_key search_key; - struct inode *dir; + struct btrfs_inode *dir; u8 log_flags; bool exists; int ret; @@ -1881,9 +1867,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, ret = PTR_ERR(dir_dst_di); goto out; } else if (dir_dst_di) { - ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, - dir_dst_di, &log_key, - log_flags, exists); + ret = delete_conflicting_dir_entry(trans, dir, path, dir_dst_di, + &log_key, log_flags, exists); if (ret < 0) goto out; dir_dst_matches = (ret == 1); @@ -1898,9 +1883,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, ret = PTR_ERR(index_dst_di); goto out; } else if (index_dst_di) { - ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, - index_dst_di, &log_key, - log_flags, exists); + ret = delete_conflicting_dir_entry(trans, dir, path, index_dst_di, + &log_key, log_flags, exists); if (ret < 0) goto out; index_dst_matches = (ret == 1); @@ -1955,11 +1939,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, out: if (!ret && update_size) { - btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2); - ret = btrfs_update_inode(trans, BTRFS_I(dir)); + btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2); + ret = btrfs_update_inode(trans, dir); } kfree(name.name); - iput(dir); + iput(&dir->vfs_inode); if (!ret && name_added) ret = 1; return ret; @@ -2116,16 +2100,16 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, struct btrfs_path *log_path, - struct inode *dir, + struct btrfs_inode *dir, struct btrfs_key *dir_key) { - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; int ret; struct extent_buffer *eb; int slot; struct btrfs_dir_item *di; struct fscrypt_str name = { 0 }; - struct inode *inode = NULL; + struct btrfs_inode *inode = NULL; struct btrfs_key location; /* @@ -2172,9 +2156,8 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, if (ret) goto out; - inc_nlink(inode); - ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), - &name); + inc_nlink(&inode->vfs_inode); + ret = unlink_inode_for_log_replay(trans, dir, inode, &name); /* * Unlike dir item keys, dir index keys can only have one name (entry) in * them, as there are no key collisions since each key has a unique offset @@ -2184,7 +2167,8 @@ out: btrfs_release_path(path); btrfs_release_path(log_path); kfree(name.name); - iput(inode); + if (inode) + iput(&inode->vfs_inode); return ret; } @@ -2308,7 +2292,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_key dir_key; struct btrfs_key found_key; struct btrfs_path *log_path; - struct inode *dir; + struct btrfs_inode *dir; dir_key.objectid = dirid; dir_key.type = BTRFS_DIR_INDEX_KEY; @@ -2385,7 +2369,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); btrfs_free_path(log_path); - iput(dir); + iput(&dir->vfs_inode); return ret; } @@ -2479,7 +2463,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, */ if (S_ISREG(mode)) { struct btrfs_drop_extents_args drop_args = { 0 }; - struct inode *inode; + struct btrfs_inode *inode; u64 from; inode = read_one_inode(root, 
key.objectid); @@ -2487,22 +2471,20 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, ret = -EIO; break; } - from = ALIGN(i_size_read(inode), + from = ALIGN(i_size_read(&inode->vfs_inode), root->fs_info->sectorsize); drop_args.start = from; drop_args.end = (u64)-1; drop_args.drop_cache = true; - ret = btrfs_drop_extents(wc->trans, root, - BTRFS_I(inode), + ret = btrfs_drop_extents(wc->trans, root, inode, &drop_args); if (!ret) { - inode_sub_bytes(inode, + inode_sub_bytes(&inode->vfs_inode, drop_args.bytes_found); /* Update the inode's nbytes. */ - ret = btrfs_update_inode(wc->trans, - BTRFS_I(inode)); + ret = btrfs_update_inode(wc->trans, inode); } - iput(inode); + iput(&inode->vfs_inode); if (ret) break; } @@ -3560,8 +3542,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, struct btrfs_dir_log_item *item; key.objectid = dirid; - key.offset = first_offset; key.type = BTRFS_DIR_LOG_INDEX_KEY; + key.offset = first_offset; ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); /* * -EEXIST is fine and can happen sporadically when we are logging a @@ -5481,7 +5463,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, ihold(&curr_inode->vfs_inode); while (true) { - struct inode *vfs_inode; struct btrfs_key key; struct btrfs_key found_key; u64 next_index; @@ -5497,7 +5478,7 @@ again: struct extent_buffer *leaf = path->nodes[0]; struct btrfs_dir_item *di; struct btrfs_key di_key; - struct inode *di_inode; + struct btrfs_inode *di_inode; int log_mode = LOG_INODE_EXISTS; int type; @@ -5524,17 +5505,16 @@ again: goto out; } - if (!need_log_inode(trans, BTRFS_I(di_inode))) { - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + if (!need_log_inode(trans, di_inode)) { + btrfs_add_delayed_iput(di_inode); break; } ctx->log_new_dentries = false; if (type == BTRFS_FT_DIR) log_mode = LOG_INODE_ALL; - ret = btrfs_log_inode(trans, BTRFS_I(di_inode), - log_mode, ctx); - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + ret = btrfs_log_inode(trans, di_inode, log_mode, ctx); + btrfs_add_delayed_iput(di_inode); if (ret) goto out; if (ctx->log_new_dentries) { @@ -5576,14 +5556,13 @@ again: kfree(dir_elem); btrfs_add_delayed_iput(curr_inode); - curr_inode = NULL; - vfs_inode = btrfs_iget_logging(ino, root); - if (IS_ERR(vfs_inode)) { - ret = PTR_ERR(vfs_inode); + curr_inode = btrfs_iget_logging(ino, root); + if (IS_ERR(curr_inode)) { + ret = PTR_ERR(curr_inode); + curr_inode = NULL; break; } - curr_inode = BTRFS_I(vfs_inode); } out: btrfs_free_path(path); @@ -5661,7 +5640,7 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { struct btrfs_ino_list *ino_elem; - struct inode *inode; + struct btrfs_inode *inode; /* * It's rare to have a lot of conflicting inodes, in practice it is not @@ -5752,12 +5731,12 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, * inode in LOG_INODE_EXISTS mode and rename operations update the log, * so that the log ends up with the new name and without the old name. 
*/ - if (!need_log_inode(trans, BTRFS_I(inode))) { - btrfs_add_delayed_iput(BTRFS_I(inode)); + if (!need_log_inode(trans, inode)) { + btrfs_add_delayed_iput(inode); return 0; } - btrfs_add_delayed_iput(BTRFS_I(inode)); + btrfs_add_delayed_iput(inode); ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); if (!ino_elem) @@ -5793,7 +5772,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, */ while (!list_empty(&ctx->conflict_inodes)) { struct btrfs_ino_list *curr; - struct inode *inode; + struct btrfs_inode *inode; u64 ino; u64 parent; @@ -5829,9 +5808,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * dir index key range logged for the directory. So we * must make sure the deletion is recorded. */ - ret = btrfs_log_inode(trans, BTRFS_I(inode), - LOG_INODE_ALL, ctx); - btrfs_add_delayed_iput(BTRFS_I(inode)); + ret = btrfs_log_inode(trans, inode, LOG_INODE_ALL, ctx); + btrfs_add_delayed_iput(inode); if (ret) break; continue; @@ -5847,8 +5825,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * it again because if some other task logged the inode after * that, we can avoid doing it again. */ - if (!need_log_inode(trans, BTRFS_I(inode))) { - btrfs_add_delayed_iput(BTRFS_I(inode)); + if (!need_log_inode(trans, inode)) { + btrfs_add_delayed_iput(inode); continue; } @@ -5859,8 +5837,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * well because during a rename we pin the log and update the * log with the new name before we unpin it. */ - ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); - btrfs_add_delayed_iput(BTRFS_I(inode)); + ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx); + btrfs_add_delayed_iput(inode); if (ret) break; } @@ -6351,7 +6329,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, list_for_each_entry(item, delayed_ins_list, log_list) { struct btrfs_dir_item *dir_item; - struct inode *di_inode; + struct btrfs_inode *di_inode; struct btrfs_key key; int log_mode = LOG_INODE_EXISTS; @@ -6367,8 +6345,8 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, break; } - if (!need_log_inode(trans, BTRFS_I(di_inode))) { - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + if (!need_log_inode(trans, di_inode)) { + btrfs_add_delayed_iput(di_inode); continue; } @@ -6376,12 +6354,12 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, log_mode = LOG_INODE_ALL; ctx->log_new_dentries = false; - ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); + ret = btrfs_log_inode(trans, di_inode, log_mode, ctx); if (!ret && ctx->log_new_dentries) - ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx); + ret = log_new_dir_dentries(trans, di_inode, ctx); - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + btrfs_add_delayed_iput(di_inode); if (ret) break; @@ -6789,7 +6767,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr_offset(leaf, slot); while (cur_offset < item_size) { struct btrfs_key inode_key; - struct inode *dir_inode; + struct btrfs_inode *dir_inode; inode_key.type = BTRFS_INODE_ITEM_KEY; inode_key.offset = 0; @@ -6838,18 +6816,16 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, goto out; } - if (!need_log_inode(trans, BTRFS_I(dir_inode))) { - btrfs_add_delayed_iput(BTRFS_I(dir_inode)); + if (!need_log_inode(trans, dir_inode)) { + btrfs_add_delayed_iput(dir_inode); continue; } ctx->log_new_dentries = false; - ret = btrfs_log_inode(trans, BTRFS_I(dir_inode), - 
LOG_INODE_ALL, ctx); + ret = btrfs_log_inode(trans, dir_inode, LOG_INODE_ALL, ctx); if (!ret && ctx->log_new_dentries) - ret = log_new_dir_dentries(trans, - BTRFS_I(dir_inode), ctx); - btrfs_add_delayed_iput(BTRFS_I(dir_inode)); + ret = log_new_dir_dentries(trans, dir_inode, ctx); + btrfs_add_delayed_iput(dir_inode); if (ret) goto out; } @@ -6874,7 +6850,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; int slot; struct btrfs_key search_key; - struct inode *inode; + struct btrfs_inode *inode; u64 ino; int ret = 0; @@ -6889,11 +6865,10 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) return PTR_ERR(inode); - if (BTRFS_I(inode)->generation >= trans->transid && - need_log_inode(trans, BTRFS_I(inode))) - ret = btrfs_log_inode(trans, BTRFS_I(inode), - LOG_INODE_EXISTS, ctx); - btrfs_add_delayed_iput(BTRFS_I(inode)); + if (inode->generation >= trans->transid && + need_log_inode(trans, inode)) + ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx); + btrfs_add_delayed_iput(inode); if (ret) return ret; @@ -7061,26 +7036,20 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; - bool log_dentries = false; + bool log_dentries; - if (btrfs_test_opt(fs_info, NOTREELOG)) { - ret = BTRFS_LOG_FORCE_COMMIT; - goto end_no_trans; - } + if (btrfs_test_opt(fs_info, NOTREELOG)) + return BTRFS_LOG_FORCE_COMMIT; - if (btrfs_root_refs(&root->root_item) == 0) { - ret = BTRFS_LOG_FORCE_COMMIT; - goto end_no_trans; - } + if (btrfs_root_refs(&root->root_item) == 0) + return BTRFS_LOG_FORCE_COMMIT; /* * If we're logging an inode from a subvolume created in the current * transaction we must force a commit since the root is not persisted. */ - if (btrfs_root_generation(&root->root_item) == trans->transid) { - ret = BTRFS_LOG_FORCE_COMMIT; - goto end_no_trans; - } + if (btrfs_root_generation(&root->root_item) == trans->transid) + return BTRFS_LOG_FORCE_COMMIT; /* * Skip already logged inodes or inodes corresponding to tmpfiles @@ -7089,14 +7058,12 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, */ if ((btrfs_inode_in_log(inode, trans->transid) && list_empty(&ctx->ordered_extents)) || - inode->vfs_inode.i_nlink == 0) { - ret = BTRFS_NO_LOG_SYNC; - goto end_no_trans; - } + inode->vfs_inode.i_nlink == 0) + return BTRFS_NO_LOG_SYNC; ret = start_log_trans(trans, root, ctx); if (ret) - goto end_no_trans; + return ret; ret = btrfs_log_inode(trans, inode, inode_only, ctx); if (ret) @@ -7115,8 +7082,11 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; } - if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries) - log_dentries = true; + /* + * Track if we need to log dentries because ctx->log_new_dentries can + * be modified in the call chains below. 
+ */ + log_dentries = ctx->log_new_dentries; /* * On unlink we must make sure all our current and old parent directory @@ -7171,8 +7141,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (log_dentries) ret = log_new_dir_dentries(trans, inode, ctx); - else - ret = 0; end_trans: if (ret < 0) { btrfs_set_log_full_commit(trans); @@ -7182,7 +7150,7 @@ end_trans: if (ret) btrfs_remove_log_ctx(root, ctx); btrfs_end_log_trans(root); -end_no_trans: + return ret; } @@ -7247,8 +7215,8 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) again: key.objectid = BTRFS_TREE_LOG_OBJECTID; - key.offset = (u64)-1; key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; while (1) { ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index e97ad824ae16..b7a96a005487 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -485,7 +485,7 @@ static int rollback_verity(struct btrfs_inode *inode) goto out; } inode->ro_flags &= ~BTRFS_INODE_RO_VERITY; - btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + btrfs_sync_inode_flags_to_i_flags(inode); ret = btrfs_update_inode(trans, inode); if (ret) { btrfs_abort_transaction(trans, ret); @@ -552,7 +552,7 @@ static int finish_verity(struct btrfs_inode *inode, const void *desc, goto out; } inode->ro_flags |= BTRFS_INODE_RO_VERITY; - btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + btrfs_sync_inode_flags_to_i_flags(inode); ret = btrfs_update_inode(trans, inode); if (ret) goto end_trans; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0a0776489055..c8c21c55be53 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1798,8 +1798,8 @@ again: path->skip_locking = 1; key.objectid = device->devid; - key.offset = search_start; key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = search_start; ret = btrfs_search_backwards(root, &key, path); if (ret < 0) @@ -1918,8 +1918,8 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, return -ENOMEM; key.objectid = device->devid; - key.offset = start; key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = start; again: ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { @@ -2721,8 +2721,8 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) return -ENOMEM; key.objectid = BTRFS_DEV_ITEMS_OBJECTID; - key.offset = 0; key.type = BTRFS_DEV_ITEM_KEY; + key.offset = 0; while (1) { btrfs_reserve_chunk_metadata(trans, false); @@ -3119,8 +3119,8 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) return -ENOMEM; key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - key.offset = chunk_offset; key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = chunk_offset; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) @@ -3577,8 +3577,8 @@ static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) again: key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = (u64)-1; while (1) { mutex_lock(&fs_info->reclaim_bgs_lock); @@ -4184,8 +4184,8 @@ again: bctl->sys.limit = limit_sys; } key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = (u64)-1; while (1) { if ((!counting && atomic_read(&fs_info->balance_pause_req)) || @@ -5001,8 +5001,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) again: key.objectid = device->devid; - key.offset = (u64)-1; key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = (u64)-1; do { 
mutex_lock(&fs_info->reclaim_bgs_lock); @@ -7155,6 +7155,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, btrfs_err(fs_info, "failed to add chunk map, start=%llu len=%llu: %d", map->start, map->chunk_len, ret); + btrfs_free_chunk_map(map); } return ret; @@ -7200,8 +7201,12 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, fs_devices = find_fsid(fsid, NULL); if (!fs_devices) { - if (!btrfs_test_opt(fs_info, DEGRADED)) + if (!btrfs_test_opt(fs_info, DEGRADED)) { + btrfs_err(fs_info, + "failed to find fsid %pU when attempting to open seed devices", + fsid); return ERR_PTR(-ENOENT); + } fs_devices = alloc_fs_devices(fsid); if (IS_ERR(fs_devices)) @@ -7534,8 +7539,8 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). */ key.objectid = BTRFS_DEV_ITEMS_OBJECTID; - key.offset = 0; key.type = 0; + key.offset = 0; btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct extent_buffer *node = path->nodes[1]; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 120f65e21eeb..e247d551da67 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -7,6 +7,7 @@ #define BTRFS_VOLUMES_H #include <linux/blk_types.h> +#include <linux/blkdev.h> #include <linux/sizes.h> #include <linux/atomic.h> #include <linux/sort.h> @@ -18,14 +19,17 @@ #include <linux/completion.h> #include <linux/rbtree.h> #include <uapi/linux/btrfs.h> +#include <uapi/linux/btrfs_tree.h> #include "messages.h" #include "rcu-string.h" +#include "extent-io-tree.h" struct block_device; struct bdev_handle; struct btrfs_fs_info; struct btrfs_block_group; struct btrfs_trans_handle; +struct btrfs_transaction; struct btrfs_zoned_device_info; #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 8dc4cf49f6f0..0ce10e4ec836 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -6,6 +6,8 @@ #ifndef BTRFS_XATTR_H #define BTRFS_XATTR_H +#include <linux/types.h> + struct dentry; struct inode; struct qstr; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index c9e92c6941ec..545f413d81fc 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -94,6 +94,47 @@ fail: return ERR_PTR(-ENOMEM); } +/* + * Helper for S390x with hardware zlib compression support. + * + * That hardware acceleration requires a buffer size larger than a single page + * to get ideal performance, thus we need to do the memory copy rather than + * use the page cache directly as input buffer. + */ +static int copy_data_into_buffer(struct address_space *mapping, + struct workspace *workspace, u64 filepos, + unsigned long length) +{ + u64 cur = filepos; + + /* It's only for hardware accelerated zlib code. */ + ASSERT(zlib_deflate_dfltcc_enabled()); + + while (cur < filepos + length) { + struct folio *folio; + void *data_in; + unsigned int offset; + unsigned long copy_length; + int ret; + + ret = btrfs_compress_filemap_get_folio(mapping, cur, &folio); + if (ret < 0) + return ret; + /* No large folio support yet. 
*/ + ASSERT(!folio_test_large(folio)); + + offset = offset_in_folio(folio, cur); + copy_length = min(folio_size(folio) - offset, + filepos + length - cur); + + data_in = kmap_local_folio(folio, offset); + memcpy(workspace->buf + cur - filepos, data_in, copy_length); + kunmap_local(data_in); + cur += copy_length; + } + return 0; +} + int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) @@ -105,8 +146,6 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, int nr_folios = 0; struct folio *in_folio = NULL; struct folio *out_folio = NULL; - unsigned long bytes_left; - unsigned int in_buf_folios; unsigned long len = *total_out; unsigned long nr_dest_folios = *out_folios; const unsigned long max_out = nr_dest_folios * PAGE_SIZE; @@ -150,34 +189,21 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, * the workspace buffer if required. */ if (workspace->strm.avail_in == 0) { - bytes_left = len - workspace->strm.total_in; - in_buf_folios = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE), - workspace->buf_size / PAGE_SIZE); - if (in_buf_folios > 1) { - int i; - - /* S390 hardware acceleration path, not subpage. */ - ASSERT(!btrfs_is_subpage( - inode_to_fs_info(mapping->host), - mapping)); - for (i = 0; i < in_buf_folios; i++) { - if (data_in) { - kunmap_local(data_in); - folio_put(in_folio); - data_in = NULL; - } - ret = btrfs_compress_filemap_get_folio(mapping, - start, &in_folio); - if (ret < 0) - goto out; - data_in = kmap_local_folio(in_folio, 0); - copy_page(workspace->buf + i * PAGE_SIZE, - data_in); - start += PAGE_SIZE; - } + unsigned long bytes_left = len - workspace->strm.total_in; + unsigned int copy_length = min(bytes_left, workspace->buf_size); + + /* + * This can only happen when hardware zlib compression is + * enabled. 
+ */ + if (copy_length > PAGE_SIZE) { + ret = copy_data_into_buffer(mapping, workspace, + start, copy_length); + if (ret < 0) + goto out; + start += copy_length; workspace->strm.next_in = workspace->buf; - workspace->strm.avail_in = min(bytes_left, - in_buf_folios << PAGE_SHIFT); + workspace->strm.avail_in = copy_length; } else { unsigned int pg_off; unsigned int cur_len; @@ -463,6 +489,7 @@ out: const struct btrfs_compress_op btrfs_zlib_compress = { .workspace_manager = &wsm, + .min_level = 1, .max_level = 9, .default_level = BTRFS_ZLIB_DEFAULT_LEVEL, }; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 73e0aa9fc08a..fb8b8b29c169 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2111,6 +2111,9 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) physical = map->stripes[i].physical; zinfo = device->zone_info; + if (!device->bdev) + continue; + if (zinfo->max_active_zones == 0) continue; @@ -2272,6 +2275,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ struct btrfs_zoned_device_info *zinfo = device->zone_info; unsigned int nofs_flags; + if (!device->bdev) + continue; + if (zinfo->max_active_zones == 0) continue; @@ -2325,6 +2331,9 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) if (!btrfs_is_zoned(fs_info)) return true; + if (test_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags)) + return false; + /* Check if there is a device with active zones left */ mutex_lock(&fs_info->chunk_mutex); spin_lock(&fs_info->zone_active_bgs_lock); diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 5232b56d5892..3541efa765c7 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -26,11 +26,12 @@ #define ZSTD_BTRFS_MAX_WINDOWLOG 17 #define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) #define ZSTD_BTRFS_DEFAULT_LEVEL 3 +#define ZSTD_BTRFS_MIN_LEVEL -15 #define ZSTD_BTRFS_MAX_LEVEL 15 /* 307s to avoid pathologically clashing with transaction commit */ #define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ) -static zstd_parameters zstd_get_btrfs_parameters(unsigned int level, +static zstd_parameters zstd_get_btrfs_parameters(int level, size_t src_len) { zstd_parameters params = zstd_get_params(level, src_len); @@ -45,13 +46,14 @@ struct workspace { void *mem; size_t size; char *buf; - unsigned int level; - unsigned int req_level; + int level; + int req_level; unsigned long last_used; /* jiffies */ struct list_head list; struct list_head lru_list; zstd_in_buffer in_buf; zstd_out_buffer out_buf; + zstd_parameters params; }; /* @@ -93,8 +95,10 @@ static inline struct workspace *list_to_workspace(struct list_head *list) return container_of(list, struct workspace, list); } -void zstd_free_workspace(struct list_head *ws); -struct list_head *zstd_alloc_workspace(unsigned int level); +static inline int clip_level(int level) +{ + return max(0, level - 1); +} /* * Timer callback to free unused workspaces. 
@@ -123,7 +127,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) list_for_each_prev_safe(pos, next, &wsm.lru_list) { struct workspace *victim = container_of(pos, struct workspace, lru_list); - unsigned int level; + int level; if (time_after(victim->last_used, reclaim_threshold)) break; @@ -137,8 +141,8 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) list_del(&victim->list); zstd_free_workspace(&victim->list); - if (list_empty(&wsm.idle_ws[level - 1])) - clear_bit(level - 1, &wsm.active_map); + if (list_empty(&wsm.idle_ws[level])) + clear_bit(level, &wsm.active_map); } @@ -160,9 +164,11 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) static void zstd_calc_ws_mem_sizes(void) { size_t max_size = 0; - unsigned int level; + int level; - for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) { + for (level = ZSTD_BTRFS_MIN_LEVEL; level <= ZSTD_BTRFS_MAX_LEVEL; level++) { + if (level == 0) + continue; zstd_parameters params = zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT); size_t level_size = @@ -171,7 +177,8 @@ static void zstd_calc_ws_mem_sizes(void) zstd_dstream_workspace_bound(ZSTD_BTRFS_MAX_INPUT)); max_size = max_t(size_t, max_size, level_size); - zstd_ws_mem_sizes[level - 1] = max_size; + /* Use level 1 workspace size for all the fast mode negative levels. */ + zstd_ws_mem_sizes[clip_level(level)] = max_size; } } @@ -218,7 +225,7 @@ void zstd_cleanup_workspace_manager(void) } spin_unlock_bh(&wsm.lock); - del_timer_sync(&wsm.timer); + timer_delete_sync(&wsm.timer); } /* @@ -233,11 +240,11 @@ void zstd_cleanup_workspace_manager(void) * offer the opportunity to reclaim the workspace in favor of allocating an * appropriately sized one in the future. */ -static struct list_head *zstd_find_workspace(unsigned int level) +static struct list_head *zstd_find_workspace(int level) { struct list_head *ws; struct workspace *workspace; - int i = level - 1; + int i = clip_level(level); spin_lock_bh(&wsm.lock); for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) { @@ -247,7 +254,7 @@ static struct list_head *zstd_find_workspace(unsigned int level) list_del_init(ws); /* keep its place if it's a lower level using this */ workspace->req_level = level; - if (level == workspace->level) + if (clip_level(level) == workspace->level) list_del(&workspace->lru_list); if (list_empty(&wsm.idle_ws[i])) clear_bit(i, &wsm.active_map); @@ -270,7 +277,7 @@ static struct list_head *zstd_find_workspace(unsigned int level) * attempt to allocate a new workspace. If we fail to allocate one due to * memory pressure, go to sleep waiting for the max level workspace to free up. 
*/ -struct list_head *zstd_get_workspace(unsigned int level) +struct list_head *zstd_get_workspace(int level) { struct list_head *ws; unsigned int nofs_flag; @@ -319,7 +326,7 @@ void zstd_put_workspace(struct list_head *ws) spin_lock_bh(&wsm.lock); /* A node is only taken off the lru if we are the corresponding level */ - if (workspace->req_level == workspace->level) { + if (clip_level(workspace->req_level) == workspace->level) { /* Hide a max level workspace from reclaim */ if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) { INIT_LIST_HEAD(&workspace->lru_list); @@ -332,13 +339,13 @@ void zstd_put_workspace(struct list_head *ws) } } - set_bit(workspace->level - 1, &wsm.active_map); - list_add(&workspace->list, &wsm.idle_ws[workspace->level - 1]); + set_bit(workspace->level, &wsm.active_map); + list_add(&workspace->list, &wsm.idle_ws[workspace->level]); workspace->req_level = 0; spin_unlock_bh(&wsm.lock); - if (workspace->level == ZSTD_BTRFS_MAX_LEVEL) + if (workspace->level == clip_level(ZSTD_BTRFS_MAX_LEVEL)) cond_wake_up(&wsm.wait); } @@ -351,7 +358,7 @@ void zstd_free_workspace(struct list_head *ws) kfree(workspace); } -struct list_head *zstd_alloc_workspace(unsigned int level) +struct list_head *zstd_alloc_workspace(int level) { struct workspace *workspace; @@ -359,8 +366,9 @@ struct list_head *zstd_alloc_workspace(unsigned int level) if (!workspace) return ERR_PTR(-ENOMEM); - workspace->size = zstd_ws_mem_sizes[level - 1]; - workspace->level = level; + /* Use level 1 workspace size for all the fast mode negative levels. */ + workspace->size = zstd_ws_mem_sizes[clip_level(level)]; + workspace->level = clip_level(level); workspace->req_level = level; workspace->last_used = jiffies; workspace->mem = kvmalloc(workspace->size, GFP_KERNEL | __GFP_NOWARN); @@ -393,17 +401,15 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, const unsigned long nr_dest_folios = *out_folios; const u64 orig_end = start + len; unsigned long max_out = nr_dest_folios * PAGE_SIZE; - unsigned int pg_off; unsigned int cur_len; - zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level, - len); + workspace->params = zstd_get_btrfs_parameters(workspace->req_level, len); *out_folios = 0; *total_out = 0; *total_in = 0; /* Initialize the stream */ - stream = zstd_init_cstream(¶ms, len, workspace->mem, + stream = zstd_init_cstream(&workspace->params, len, workspace->mem, workspace->size); if (unlikely(!stream)) { struct btrfs_inode *inode = BTRFS_I(mapping->host); @@ -420,9 +426,8 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); if (ret < 0) goto out; - pg_off = offset_in_page(start); cur_len = btrfs_calc_input_length(orig_end, start); - workspace->in_buf.src = kmap_local_folio(in_folio, pg_off); + workspace->in_buf.src = kmap_local_folio(in_folio, offset_in_page(start)); workspace->in_buf.pos = 0; workspace->in_buf.size = cur_len; @@ -506,9 +511,9 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); if (ret < 0) goto out; - pg_off = offset_in_page(start); cur_len = btrfs_calc_input_length(orig_end, start); - workspace->in_buf.src = kmap_local_folio(in_folio, pg_off); + workspace->in_buf.src = kmap_local_folio(in_folio, + offset_in_page(start)); workspace->in_buf.pos = 0; workspace->in_buf.size = cur_len; } @@ -717,6 +722,7 @@ finish: const struct btrfs_compress_op 
btrfs_zstd_compress = { /* ZSTD uses own workspace manager */ .workspace_manager = NULL, + .min_level = ZSTD_BTRFS_MIN_LEVEL, .max_level = ZSTD_BTRFS_MAX_LEVEL, .default_level = ZSTD_BTRFS_DEFAULT_LEVEL, };
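The negative "fast" levels added above (down to ZSTD_BTRFS_MIN_LEVEL = -15) reuse the existing per-level workspace tables, which stay sized for levels 1..15: clip_level() folds a requested level into a zero-based table index, with every negative level sharing the level 1 slot. A standalone userspace sketch of that mapping, mirroring clip_level() and the constants from the zstd.c hunks (illustrative only, not kernel code):

#include <stdio.h>

#define ZSTD_BTRFS_MIN_LEVEL (-15)
#define ZSTD_BTRFS_MAX_LEVEL 15

/*
 * Mirrors clip_level() above: all fast (negative) levels share the level 1
 * workspace bucket at index 0, and level N maps to index N - 1.
 */
static int clip_level(int level)
{
	return level < 1 ? 0 : level - 1;
}

int main(void)
{
	for (int level = ZSTD_BTRFS_MIN_LEVEL; level <= ZSTD_BTRFS_MAX_LEVEL; level++) {
		/* Level 0 is skipped, as in zstd_calc_ws_mem_sizes(). */
		if (level == 0)
			continue;
		printf("req_level %3d -> idle_ws index %2d\n",
		       level, clip_level(level));
	}
	return 0;
}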