diff options
Diffstat (limited to 'fs/btrfs')
65 files changed, 4054 insertions, 1508 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 3cfc440c636c..2d5f0482678b 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -44,4 +44,4 @@ btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \ tests/free-space-tree-tests.o tests/extent-map-tests.o \ - tests/raid-stripe-tree-tests.o + tests/raid-stripe-tree-tests.o tests/delayed-refs-tests.o diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 361a866c1995..a4c51600a408 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -18,7 +18,7 @@ enum { }; #define NO_THRESHOLD (-1) -#define DFT_THRESHOLD (32) +#define DEFAULT_THRESHOLD (32) struct btrfs_workqueue { struct workqueue_struct *normal_wq; @@ -94,9 +94,9 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, ret->limit_active = limit_active; if (thresh == 0) - thresh = DFT_THRESHOLD; + thresh = DEFAULT_THRESHOLD; /* For low threshold, disabling threshold is a better choice */ - if (thresh < DFT_THRESHOLD) { + if (thresh < DEFAULT_THRESHOLD) { ret->current_active = limit_active; ret->thresh = NO_THRESHOLD; } else { diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 04f53ca548e1..3d3923cfc357 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -250,6 +250,21 @@ static int prelim_ref_compare(const struct prelim_ref *ref1, return 0; } +static int prelim_ref_rb_add_cmp(const struct rb_node *new, + const struct rb_node *exist) +{ + const struct prelim_ref *ref_new = + rb_entry(new, struct prelim_ref, rbnode); + const struct prelim_ref *ref_exist = + rb_entry(exist, struct prelim_ref, rbnode); + + /* + * prelim_ref_compare() expects the first parameter as the existing one, + * different from the rb_find_add_cached() order. 
+ */ + return prelim_ref_compare(ref_exist, ref_new); +} + static void update_share_count(struct share_check *sc, int oldcount, int newcount, const struct prelim_ref *newref) { @@ -278,55 +293,39 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info, struct share_check *sc) { struct rb_root_cached *root; - struct rb_node **p; - struct rb_node *parent = NULL; - struct prelim_ref *ref; - int result; - bool leftmost = true; + struct rb_node *exist; root = &preftree->root; - p = &root->rb_root.rb_node; + exist = rb_find_add_cached(&newref->rbnode, root, prelim_ref_rb_add_cmp); + if (exist) { + struct prelim_ref *ref = rb_entry(exist, struct prelim_ref, rbnode); + /* Identical refs, merge them and free @newref */ + struct extent_inode_elem *eie = ref->inode_list; - while (*p) { - parent = *p; - ref = rb_entry(parent, struct prelim_ref, rbnode); - result = prelim_ref_compare(ref, newref); - if (result < 0) { - p = &(*p)->rb_left; - } else if (result > 0) { - p = &(*p)->rb_right; - leftmost = false; - } else { - /* Identical refs, merge them and free @newref */ - struct extent_inode_elem *eie = ref->inode_list; - - while (eie && eie->next) - eie = eie->next; + while (eie && eie->next) + eie = eie->next; - if (!eie) - ref->inode_list = newref->inode_list; - else - eie->next = newref->inode_list; - trace_btrfs_prelim_ref_merge(fs_info, ref, newref, - preftree->count); - /* - * A delayed ref can have newref->count < 0. - * The ref->count is updated to follow any - * BTRFS_[ADD|DROP]_DELAYED_REF actions. - */ - update_share_count(sc, ref->count, - ref->count + newref->count, newref); - ref->count += newref->count; - free_pref(newref); - return; - } + if (!eie) + ref->inode_list = newref->inode_list; + else + eie->next = newref->inode_list; + trace_btrfs_prelim_ref_merge(fs_info, ref, newref, + preftree->count); + /* + * A delayed ref can have newref->count < 0. + * The ref->count is updated to follow any + * BTRFS_[ADD|DROP]_DELAYED_REF actions. 
+ */ + update_share_count(sc, ref->count, + ref->count + newref->count, newref); + ref->count += newref->count; + free_pref(newref); + return; } update_share_count(sc, 0, newref->count, newref); preftree->count++; trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count); - rb_link_node(&newref->rbnode, parent, p); - rb_insert_color_cached(&newref->rbnode, root, leftmost); } /* @@ -3022,9 +3021,6 @@ void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info, cache->rb_root = RB_ROOT; for (i = 0; i < BTRFS_MAX_LEVEL; i++) INIT_LIST_HEAD(&cache->pending[i]); - INIT_LIST_HEAD(&cache->changed); - INIT_LIST_HEAD(&cache->detached); - INIT_LIST_HEAD(&cache->leaves); INIT_LIST_HEAD(&cache->pending_edge); INIT_LIST_HEAD(&cache->useless_node); cache->fs_info = fs_info; @@ -3132,29 +3128,17 @@ void btrfs_backref_drop_node(struct btrfs_backref_cache *tree, void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, struct btrfs_backref_node *node) { - struct btrfs_backref_node *upper; struct btrfs_backref_edge *edge; if (!node) return; - BUG_ON(!node->lowest && !node->detached); while (!list_empty(&node->upper)) { edge = list_entry(node->upper.next, struct btrfs_backref_edge, list[LOWER]); - upper = edge->node[UPPER]; list_del(&edge->list[LOWER]); list_del(&edge->list[UPPER]); btrfs_backref_free_edge(cache, edge); - - /* - * Add the node to leaf node list if no other child block - * cached. 
- */ - if (list_empty(&upper->lower)) { - list_add_tail(&upper->lower, &cache->leaves); - upper->lowest = 1; - } } btrfs_backref_drop_node(cache, node); @@ -3166,33 +3150,13 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, void btrfs_backref_release_cache(struct btrfs_backref_cache *cache) { struct btrfs_backref_node *node; - int i; - - while (!list_empty(&cache->detached)) { - node = list_entry(cache->detached.next, - struct btrfs_backref_node, list); - btrfs_backref_cleanup_node(cache, node); - } - while (!list_empty(&cache->leaves)) { - node = list_entry(cache->leaves.next, - struct btrfs_backref_node, lower); + while ((node = rb_entry_safe(rb_first(&cache->rb_root), + struct btrfs_backref_node, rb_node))) btrfs_backref_cleanup_node(cache, node); - } - for (i = 0; i < BTRFS_MAX_LEVEL; i++) { - while (!list_empty(&cache->pending[i])) { - node = list_first_entry(&cache->pending[i], - struct btrfs_backref_node, - list); - btrfs_backref_cleanup_node(cache, node); - } - } ASSERT(list_empty(&cache->pending_edge)); ASSERT(list_empty(&cache->useless_node)); - ASSERT(list_empty(&cache->changed)); - ASSERT(list_empty(&cache->detached)); - ASSERT(RB_EMPTY_ROOT(&cache->rb_root)); ASSERT(!cache->nr_nodes); ASSERT(!cache->nr_edges); } @@ -3316,8 +3280,12 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans, root = btrfs_get_fs_root(fs_info, ref_key->offset, false); if (IS_ERR(root)) return PTR_ERR(root); - if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - cur->cowonly = 1; + + /* We shouldn't be using backref cache for non-shareable roots. 
*/ + if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) { + btrfs_put_root(root); + return -EUCLEAN; + } if (btrfs_root_level(&root->root_item) == cur->level) { /* Tree root */ @@ -3403,8 +3371,15 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans, goto out; } upper->owner = btrfs_header_owner(eb); - if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - upper->cowonly = 1; + + /* We shouldn't be using backref cache for non shareable roots. */ + if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) { + btrfs_put_root(root); + btrfs_backref_free_edge(cache, edge); + btrfs_backref_free_node(cache, upper); + ret = -EUCLEAN; + goto out; + } /* * If we know the block isn't shared we can avoid @@ -3595,15 +3570,9 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, ASSERT(start->checked); - /* Insert this node to cache if it's not COW-only */ - if (!start->cowonly) { - rb_node = rb_simple_insert(&cache->rb_root, start->bytenr, - &start->rb_node); - if (rb_node) - btrfs_backref_panic(cache->fs_info, start->bytenr, - -EEXIST); - list_add_tail(&start->lower, &cache->leaves); - } + rb_node = rb_simple_insert(&cache->rb_root, start->bytenr, &start->rb_node); + if (rb_node) + btrfs_backref_panic(cache->fs_info, start->bytenr, -EEXIST); /* * Use breadth first search to iterate all related edges. @@ -3642,11 +3611,6 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, * parents have already been linked. 
*/ if (!RB_EMPTY_NODE(&upper->rb_node)) { - if (upper->lowest) { - list_del_init(&upper->lower); - upper->lowest = 0; - } - list_add_tail(&edge->list[UPPER], &upper->lower); continue; } @@ -3657,23 +3621,13 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, return -EUCLEAN; } - /* Sanity check, COW-only node has non-COW-only parent */ - if (start->cowonly != upper->cowonly) { - ASSERT(0); + rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr, + &upper->rb_node); + if (unlikely(rb_node)) { + btrfs_backref_panic(cache->fs_info, upper->bytenr, -EEXIST); return -EUCLEAN; } - /* Only cache non-COW-only (subvolume trees) tree blocks */ - if (!upper->cowonly) { - rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr, - &upper->rb_node); - if (rb_node) { - btrfs_backref_panic(cache->fs_info, - upper->bytenr, -EEXIST); - return -EUCLEAN; - } - } - list_add_tail(&edge->list[UPPER], &upper->lower); /* diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index e8c22cccb5c1..74e614031274 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -318,6 +318,12 @@ struct btrfs_backref_node { u64 bytenr; }; /* Use rb_simple_node for search/insert */ + /* + * This is a sanity check, whenever we COW a block we will update + * new_bytenr with it's current location, and we will check this in + * various places to validate that the cache makes sense, it shouldn't + * be used for anything else. 
+ */ u64 new_bytenr; /* Objectid of tree block owner, can be not uptodate */ u64 owner; @@ -335,10 +341,6 @@ struct btrfs_backref_node { struct extent_buffer *eb; /* Level of the tree block */ unsigned int level:8; - /* Is the block in a non-shareable tree */ - unsigned int cowonly:1; - /* 1 if no child node is in the cache */ - unsigned int lowest:1; /* Is the extent buffer locked */ unsigned int locked:1; /* Has the block been processed */ @@ -391,12 +393,6 @@ struct btrfs_backref_cache { * level blocks may not reflect the new location */ struct list_head pending[BTRFS_MAX_LEVEL]; - /* List of backref nodes with no child node */ - struct list_head leaves; - /* List of blocks that have been COWed in current transaction */ - struct list_head changed; - /* List of detached backref node. */ - struct list_head detached; u64 last_trans; diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 7ea6f0b43b95..bc2555c44a12 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -453,6 +453,14 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), dev->devid, bio->bi_iter.bi_size); + /* + * Track reads if tracking is enabled; ignore I/O operations before the + * filesystem is fully initialized. 
+ */ + if (dev->fs_devices->collect_fs_stats && bio_op(bio) == REQ_OP_READ && dev->fs_info) + percpu_counter_add(&dev->fs_info->stats_read_blocks, + bio->bi_iter.bi_size >> dev->fs_info->sectorsize_bits); + if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT) blkcg_punt_bio_submit(bio); else @@ -725,8 +733,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) bio->bi_opf |= REQ_OP_ZONE_APPEND; } - if (is_data_bbio(bbio) && bioc && - btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) { + if (is_data_bbio(bbio) && bioc && bioc->use_rst) { /* * No locking for the list update, as we only add to * the list in the I/O submission path, and list diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 4427c1b835e8..c0a8f7d92acc 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -173,43 +173,41 @@ void btrfs_put_block_group(struct btrfs_block_group *cache) } } +static int btrfs_bg_start_cmp(const struct rb_node *new, + const struct rb_node *exist) +{ + const struct btrfs_block_group *new_bg = + rb_entry(new, struct btrfs_block_group, cache_node); + const struct btrfs_block_group *exist_bg = + rb_entry(exist, struct btrfs_block_group, cache_node); + + if (new_bg->start < exist_bg->start) + return -1; + if (new_bg->start > exist_bg->start) + return 1; + return 0; +} + /* * This adds the block group to the fs_info rb tree for the block group cache */ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, struct btrfs_block_group *block_group) { - struct rb_node **p; - struct rb_node *parent = NULL; - struct btrfs_block_group *cache; - bool leftmost = true; + struct rb_node *exist; + int ret = 0; ASSERT(block_group->length != 0); write_lock(&info->block_group_cache_lock); - p = &info->block_group_cache_tree.rb_root.rb_node; - - while (*p) { - parent = *p; - cache = rb_entry(parent, struct btrfs_block_group, cache_node); - if (block_group->start < cache->start) { - p = &(*p)->rb_left; - } else if (block_group->start > 
cache->start) { - p = &(*p)->rb_right; - leftmost = false; - } else { - write_unlock(&info->block_group_cache_lock); - return -EEXIST; - } - } - - rb_link_node(&block_group->cache_node, parent, p); - rb_insert_color_cached(&block_group->cache_node, - &info->block_group_cache_tree, leftmost); + exist = rb_find_add_cached(&block_group->cache_node, + &info->block_group_cache_tree, btrfs_bg_start_cmp); + if (exist) + ret = -EEXIST; write_unlock(&info->block_group_cache_lock); - return 0; + return ret; } /* @@ -1223,7 +1221,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, block_group->space_info->total_bytes -= block_group->length; block_group->space_info->bytes_readonly -= (block_group->length - block_group->zone_unusable); - btrfs_space_info_update_bytes_zone_unusable(fs_info, block_group->space_info, + btrfs_space_info_update_bytes_zone_unusable(block_group->space_info, -block_group->zone_unusable); block_group->space_info->disk_total -= block_group->length * factor; @@ -1396,8 +1394,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) if (btrfs_is_zoned(cache->fs_info)) { /* Migrate zone_unusable bytes to readonly */ sinfo->bytes_readonly += cache->zone_unusable; - btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo, - -cache->zone_unusable); + btrfs_space_info_update_bytes_zone_unusable(sinfo, -cache->zone_unusable); cache->zone_unusable = 0; } cache->ro++; @@ -1645,8 +1642,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&space_info->lock); spin_lock(&block_group->lock); - btrfs_space_info_update_bytes_pinned(fs_info, space_info, - -block_group->pinned); + btrfs_space_info_update_bytes_pinned(space_info, -block_group->pinned); space_info->bytes_readonly += block_group->pinned; block_group->pinned = 0; @@ -2672,7 +2668,6 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans, btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); btrfs_set_dev_extent_length(leaf, 
extent, num_bytes); - btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; @@ -3060,8 +3055,7 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) (cache->alloc_offset - cache->used - cache->pinned - cache->reserved) + (cache->length - cache->zone_capacity); - btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo, - cache->zone_unusable); + btrfs_space_info_update_bytes_zone_unusable(sinfo, cache->zone_unusable); sinfo->bytes_readonly -= cache->zone_unusable; } num_bytes = cache->length - cache->reserved - @@ -3123,7 +3117,6 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, cache->global_root_id); btrfs_set_stack_block_group_flags(&bgi, cache->flags); write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); - btrfs_mark_buffer_dirty(trans, leaf); fail: btrfs_release_path(path); /* @@ -3699,7 +3692,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, old_val -= num_bytes; cache->used = old_val; cache->pinned += num_bytes; - btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes); + btrfs_space_info_update_bytes_pinned(space_info, num_bytes); space_info->bytes_used -= num_bytes; space_info->disk_used -= num_bytes * factor; if (READ_ONCE(space_info->periodic_reclaim)) @@ -3781,8 +3774,7 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, space_info->bytes_reserved += num_bytes; trace_btrfs_space_reservation(cache->fs_info, "space_info", space_info->flags, num_bytes, 1); - btrfs_space_info_update_bytes_may_use(cache->fs_info, - space_info, -ram_bytes); + btrfs_space_info_update_bytes_may_use(space_info, -ram_bytes); if (delalloc) cache->delalloc_bytes += num_bytes; diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index a07b9594dc70..3f3608299c0b 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -150,9 +150,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, spin_unlock(&dest->lock); } if (num_bytes) - 
btrfs_space_info_free_bytes_may_use(fs_info, - space_info, - num_bytes); + btrfs_space_info_free_bytes_may_use(space_info, num_bytes); } if (qgroup_to_release_ret) *qgroup_to_release_ret = qgroup_to_release; @@ -383,13 +381,11 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) if (block_rsv->reserved < block_rsv->size) { num_bytes = block_rsv->size - block_rsv->reserved; - btrfs_space_info_update_bytes_may_use(fs_info, sinfo, - num_bytes); + btrfs_space_info_update_bytes_may_use(sinfo, num_bytes); block_rsv->reserved = block_rsv->size; } else if (block_rsv->reserved > block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; - btrfs_space_info_update_bytes_may_use(fs_info, sinfo, - -num_bytes); + btrfs_space_info_update_bytes_may_use(sinfo, -num_bytes); block_rsv->reserved = block_rsv->size; btrfs_try_granting_tickets(fs_info, sinfo); } diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index aa1f55cd81b7..b2fa33911c28 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -526,7 +526,7 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u32 bio_offset, struct bio_vec *bv); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, - bool nowait, bool strict); + bool nowait); void btrfs_del_delalloc_inode(struct btrfs_inode *inode); struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 693dc27ffb89..3dc5a35dd19b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -37,19 +37,6 @@ static int push_node_left(struct btrfs_trans_handle *trans, static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); - -static const struct btrfs_csums { - u16 size; - const char name[10]; - const char driver[12]; -} btrfs_csums[] = { - [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" }, - 
[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" }, - [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" }, - [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b", - .driver = "blake2b-256" }, -}; - /* * The leaf data grows from end-to-front in the node. this returns the address * of the start of the last item, which is the stop of the leaf data stack. @@ -148,44 +135,6 @@ static inline void copy_leaf_items(const struct extent_buffer *dst, nr_items * sizeof(struct btrfs_item)); } -/* This exists for btrfs-progs usages. */ -u16 btrfs_csum_type_size(u16 type) -{ - return btrfs_csums[type].size; -} - -int btrfs_super_csum_size(const struct btrfs_super_block *s) -{ - u16 t = btrfs_super_csum_type(s); - /* - * csum type is validated at mount time - */ - return btrfs_csum_type_size(t); -} - -const char *btrfs_super_csum_name(u16 csum_type) -{ - /* csum type is validated at mount time */ - return btrfs_csums[csum_type].name; -} - -/* - * Return driver name if defined, otherwise the name that's also a valid driver - * name - */ -const char *btrfs_super_csum_driver(u16 csum_type) -{ - /* csum type is validated at mount time */ - return btrfs_csums[csum_type].driver[0] ? - btrfs_csums[csum_type].driver : - btrfs_csums[csum_type].name; -} - -size_t __attribute_const__ btrfs_get_num_csums(void) -{ - return ARRAY_SIZE(btrfs_csums); -} - struct btrfs_path *btrfs_alloc_path(void) { might_sleep(); @@ -226,22 +175,6 @@ noinline void btrfs_release_path(struct btrfs_path *p) } /* - * We want the transaction abort to print stack trace only for errors where the - * cause could be a bug, eg. due to ENOSPC, and not for common errors that are - * caused by external factors. - */ -bool __cold abort_should_print_stack(int error) -{ - switch (error) { - case -EIO: - case -EROFS: - case -ENOMEM: - return false; - } - return true; -} - -/* * safely gets a reference on the root node of a tree. 
A lock * is not taken, so a concurrent writer may put a different node * at the root of the tree. See btrfs_lock_root_node for the @@ -654,6 +587,8 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, goto error_unlock_cow; } } + + trace_btrfs_cow_block(root, buf, cow); if (unlock_orig) btrfs_tree_unlock(buf); free_extent_buffer_stale(buf); @@ -710,7 +645,6 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; u64 search_start; - int ret; if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) { btrfs_abort_transaction(trans, -EUCLEAN); @@ -751,12 +685,8 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans, * Also We don't care about the error, as it's handled internally. */ btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); - ret = btrfs_force_cow_block(trans, root, buf, parent, parent_slot, - cow_ret, search_start, 0, nest); - - trace_btrfs_cow_block(root, buf, *cow_ret); - - return ret; + return btrfs_force_cow_block(trans, root, buf, parent, parent_slot, + cow_ret, search_start, 0, nest); } ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO); @@ -1566,6 +1496,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (!p->skip_locking) { btrfs_unlock_up_safe(p, parent_level + 1); + btrfs_maybe_reset_lockdep_class(root, tmp); tmp_locked = true; btrfs_tree_read_lock(tmp); btrfs_release_path(p); @@ -1609,6 +1540,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (!p->skip_locking) { ASSERT(ret == -EAGAIN); + btrfs_maybe_reset_lockdep_class(root, tmp); tmp_locked = true; btrfs_tree_read_lock(tmp); btrfs_release_path(p); @@ -3903,6 +3835,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && + key.type != BTRFS_RAID_STRIPE_KEY && key.type != BTRFS_EXTENT_CSUM_KEY); if (btrfs_leaf_free_space(leaf) >= ins_len) diff --git a/fs/btrfs/ctree.h 
b/fs/btrfs/ctree.h index 2c341956a01c..1096a80a64e7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -7,7 +7,6 @@ #define BTRFS_CTREE_H #include "linux/cleanup.h" -#include <linux/pagemap.h> #include <linux/spinlock.h> #include <linux/rbtree.h> #include <linux/mutex.h> @@ -506,20 +505,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); } -#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ - ((bytes) >> (fs_info)->sectorsize_bits) - -static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) -{ - return mapping_gfp_constraint(mapping, ~__GFP_FS); -} - -void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); -int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 *actual_bytes); -int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); - -/* ctree.c */ int __init btrfs_ctree_init(void); void __cold btrfs_ctree_exit(void); @@ -756,18 +741,4 @@ static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; } -u16 btrfs_csum_type_size(u16 type); -int btrfs_super_csum_size(const struct btrfs_super_block *s); -const char *btrfs_super_csum_name(u16 csum_type); -const char *btrfs_super_csum_driver(u16 csum_type); -size_t __attribute_const__ btrfs_get_num_csums(void); - -/* - * We use folio flag owner_2 to indicate there is an ordered extent with - * unfinished IO. 
- */ -#define folio_test_ordered(folio) folio_test_owner_2(folio) -#define folio_set_ordered(folio) folio_set_owner_2(folio) -#define folio_clear_ordered(folio) folio_clear_owner_2(folio) - #endif diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 7aa8a395d838..88e900e5a43d 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -176,7 +176,7 @@ void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, ASSERT(IS_ALIGNED(len, fs_info->sectorsize)); data_sinfo = fs_info->data_sinfo; - btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len); + btrfs_space_info_free_bytes_may_use(data_sinfo, len); } /* diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 508bdbae29a0..0b4933c6a889 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -366,40 +366,35 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_item( return NULL; } +static int btrfs_delayed_item_cmp(const struct rb_node *new, + const struct rb_node *exist) +{ + const struct btrfs_delayed_item *new_item = + rb_entry(new, struct btrfs_delayed_item, rb_node); + const struct btrfs_delayed_item *exist_item = + rb_entry(exist, struct btrfs_delayed_item, rb_node); + + if (new_item->index < exist_item->index) + return -1; + if (new_item->index > exist_item->index) + return 1; + return 0; +} + static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, struct btrfs_delayed_item *ins) { - struct rb_node **p, *node; - struct rb_node *parent_node = NULL; struct rb_root_cached *root; - struct btrfs_delayed_item *item; - bool leftmost = true; + struct rb_node *exist; if (ins->type == BTRFS_DELAYED_INSERTION_ITEM) root = &delayed_node->ins_root; else root = &delayed_node->del_root; - p = &root->rb_root.rb_node; - node = &ins->rb_node; - - while (*p) { - parent_node = *p; - item = rb_entry(parent_node, struct btrfs_delayed_item, - rb_node); - - if (item->index < ins->index) { - p = &(*p)->rb_right; - 
leftmost = false; - } else if (item->index > ins->index) { - p = &(*p)->rb_left; - } else { - return -EEXIST; - } - } - - rb_link_node(node, parent_node, p); - rb_insert_color_cached(node, root, leftmost); + exist = rb_find_add_cached(&ins->rb_node, root, btrfs_delayed_item_cmp); + if (exist) + return -EEXIST; if (ins->type == BTRFS_DELAYED_INSERTION_ITEM && ins->index >= delayed_node->index_cnt) @@ -1038,7 +1033,6 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, struct btrfs_inode_item); write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item, sizeof(struct btrfs_inode_item)); - btrfs_mark_buffer_dirty(trans, leaf); if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) goto out; @@ -1561,8 +1555,7 @@ release_node: return ret; } -static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_node *node, +static int btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node, u64 index) { struct btrfs_delayed_item *item; @@ -1620,7 +1613,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, if (IS_ERR(node)) return PTR_ERR(node); - ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, index); + ret = btrfs_delete_delayed_insertion_item(node, index); if (!ret) goto end; diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 0d878dbbabba..98c5b61dabe8 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -93,6 +93,9 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) u64 num_bytes; u64 reserved_bytes; + if (btrfs_is_testing(fs_info)) + return; + num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates); num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, trans->delayed_ref_csum_deletions); @@ -254,7 +257,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, spin_unlock(&block_rsv->lock); if (to_free > 0) - btrfs_space_info_free_bytes_may_use(fs_info, space_info, 
to_free); + btrfs_space_info_free_bytes_may_use(space_info, to_free); if (refilled_bytes > 0) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, @@ -265,8 +268,8 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, /* * compare two delayed data backrefs with same bytenr and type */ -static int comp_data_refs(struct btrfs_delayed_ref_node *ref1, - struct btrfs_delayed_ref_node *ref2) +static int comp_data_refs(const struct btrfs_delayed_ref_node *ref1, + const struct btrfs_delayed_ref_node *ref2) { if (ref1->data_ref.objectid < ref2->data_ref.objectid) return -1; @@ -279,8 +282,8 @@ static int comp_data_refs(struct btrfs_delayed_ref_node *ref1, return 0; } -static int comp_refs(struct btrfs_delayed_ref_node *ref1, - struct btrfs_delayed_ref_node *ref2, +static int comp_refs(const struct btrfs_delayed_ref_node *ref1, + const struct btrfs_delayed_ref_node *ref2, bool check_seq) { int ret = 0; @@ -314,34 +317,25 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1, return 0; } +static int cmp_refs_node(const struct rb_node *new, const struct rb_node *exist) +{ + const struct btrfs_delayed_ref_node *new_node = + rb_entry(new, struct btrfs_delayed_ref_node, ref_node); + const struct btrfs_delayed_ref_node *exist_node = + rb_entry(exist, struct btrfs_delayed_ref_node, ref_node); + + return comp_refs(new_node, exist_node, true); +} + static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, struct btrfs_delayed_ref_node *ins) { - struct rb_node **p = &root->rb_root.rb_node; struct rb_node *node = &ins->ref_node; - struct rb_node *parent_node = NULL; - struct btrfs_delayed_ref_node *entry; - bool leftmost = true; - - while (*p) { - int comp; - - parent_node = *p; - entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, - ref_node); - comp = comp_refs(ins, entry, true); - if (comp < 0) { - p = &(*p)->rb_left; - } else if (comp > 0) { - p = &(*p)->rb_right; - leftmost = false; - } else { - return entry; - } - } + 
struct rb_node *exist; - rb_link_node(node, parent_node, p); - rb_insert_color_cached(node, root, leftmost); + exist = rb_find_add_cached(node, root, cmp_refs_node); + if (exist) + return rb_entry(exist, struct btrfs_delayed_ref_node, ref_node); return NULL; } @@ -555,6 +549,32 @@ void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info, delayed_refs->num_heads_ready--; } +struct btrfs_delayed_ref_node *btrfs_select_delayed_ref(struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_ref_node *ref; + + lockdep_assert_held(&head->mutex); + lockdep_assert_held(&head->lock); + + if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) + return NULL; + + /* + * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. + * This is to prevent a ref count from going down to zero, which deletes + * the extent item from the extent tree, when there still are references + * to add, which would fail because they would not find the extent item. + */ + if (!list_empty(&head->ref_add_list)) + return list_first_entry(&head->ref_add_list, + struct btrfs_delayed_ref_node, add_list); + + ref = rb_entry(rb_first_cached(&head->ref_tree), + struct btrfs_delayed_ref_node, ref_node); + ASSERT(list_empty(&ref->add_list)); + return ref; +} + /* * Helper to insert the ref_node to the tail or merge with tail. 
* @@ -1234,6 +1254,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) { struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; struct btrfs_fs_info *fs_info = trans->fs_info; + bool testing = btrfs_is_testing(fs_info); spin_lock(&delayed_refs->lock); while (true) { @@ -1263,7 +1284,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) spin_unlock(&delayed_refs->lock); mutex_unlock(&head->mutex); - if (pin_bytes) { + if (!testing && pin_bytes) { struct btrfs_block_group *bg; bg = btrfs_lookup_block_group(fs_info, head->bytenr); @@ -1281,8 +1302,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) spin_lock(&bg->space_info->lock); spin_lock(&bg->lock); bg->pinned += head->num_bytes; - btrfs_space_info_update_bytes_pinned(fs_info, - bg->space_info, + btrfs_space_info_update_bytes_pinned(bg->space_info, head->num_bytes); bg->reserved -= head->num_bytes; bg->space_info->bytes_reserved -= head->num_bytes; @@ -1295,12 +1315,15 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) btrfs_error_unpin_extent_range(fs_info, head->bytenr, head->bytenr + head->num_bytes - 1); } - btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); + if (!testing) + btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); btrfs_put_delayed_ref_head(head); cond_resched(); spin_lock(&delayed_refs->lock); } - btrfs_qgroup_destroy_extent_records(trans); + + if (!testing) + btrfs_qgroup_destroy_extent_records(trans); spin_unlock(&delayed_refs->lock); } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 611fb3388f82..a35067cebb97 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -402,6 +402,7 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head( struct btrfs_delayed_ref_root *delayed_refs); void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); +struct btrfs_delayed_ref_node *btrfs_select_delayed_ref(struct 
btrfs_delayed_ref_head *head); int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index ac8e97ed13f7..f86fbea0b3de 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -440,9 +440,6 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) dev_replace->cursor_right); dev_replace->item_needs_writeback = 0; up_write(&dev_replace->rwsem); - - btrfs_mark_buffer_dirty(trans, eb); - out: btrfs_free_path(path); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 1ea5d8fcfbf7..ccf91de29f80 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -92,7 +92,6 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, name, name_ptr, name_len); write_extent_buffer(leaf, data, data_ptr, data_len); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); return ret; } @@ -152,7 +151,6 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, name_ptr = (unsigned long)(dir_item + 1); write_extent_buffer(leaf, name->name, name_ptr, name->len); - btrfs_mark_buffer_dirty(trans, leaf); second_insert: /* FIXME, use some real flag for selecting the extra index */ diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index a7c3e221378d..8567af46e16f 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -248,8 +248,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, len = min(len, em->len - (start - em->start)); block_start = extent_map_block_start(em) + (start - em->start); - if (can_nocow_extent(inode, start, &len, - &file_extent, false, false) == 1) { + if (can_nocow_extent(inode, start, &len, &file_extent, false) == 1) { bg = btrfs_inc_nocow_writers(fs_info, block_start); if (bg) can_nocow = true; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index eff0dd1ae62f..f09db62e61a1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -226,7 +226,7 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, 
while (1) { clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); - ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num, check); + ret = read_extent_buffer_pages(eb, mirror_num, check); if (!ret) break; @@ -1258,6 +1258,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { struct percpu_counter *em_counter = &fs_info->evictable_extent_maps; + percpu_counter_destroy(&fs_info->stats_read_blocks); percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); percpu_counter_destroy(&fs_info->ordered_bytes); @@ -2327,6 +2328,71 @@ out: return ret; } +static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info, + const struct btrfs_super_block *sb) +{ + unsigned int cur = 0; /* Offset inside the sys chunk array */ + /* + * At sb read time, fs_info is not fully initialized. Thus we have + * to use super block sectorsize, which should have been validated. + */ + const u32 sectorsize = btrfs_super_sectorsize(sb); + u32 sys_array_size = btrfs_super_sys_array_size(sb); + + if (sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { + btrfs_err(fs_info, "system chunk array too big %u > %u", + sys_array_size, BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); + return -EUCLEAN; + } + + while (cur < sys_array_size) { + struct btrfs_disk_key *disk_key; + struct btrfs_chunk *chunk; + struct btrfs_key key; + u64 type; + u16 num_stripes; + u32 len; + int ret; + + disk_key = (struct btrfs_disk_key *)(sb->sys_chunk_array + cur); + len = sizeof(*disk_key); + + if (cur + len > sys_array_size) + goto short_read; + cur += len; + + btrfs_disk_key_to_cpu(&key, disk_key); + if (key.type != BTRFS_CHUNK_ITEM_KEY) { + btrfs_err(fs_info, + "unexpected item type %u in sys_array at offset %u", + key.type, cur); + return -EUCLEAN; + } + chunk = (struct btrfs_chunk *)(sb->sys_chunk_array + cur); + num_stripes = btrfs_stack_chunk_num_stripes(chunk); + if (cur + btrfs_chunk_item_size(num_stripes) > sys_array_size) + goto short_read; + type = 
btrfs_stack_chunk_type(chunk); + if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) { + btrfs_err(fs_info, + "invalid chunk type %llu in sys_array at offset %u", + type, cur); + return -EUCLEAN; + } + ret = btrfs_check_chunk_valid(fs_info, NULL, chunk, key.offset, + sectorsize); + if (ret < 0) + return ret; + cur += btrfs_chunk_item_size(num_stripes); + } + return 0; +short_read: + btrfs_err(fs_info, + "super block sys chunk array short read, cur=%u sys_array_size=%u", + cur, sys_array_size); + return -EUCLEAN; +} + /* * Real super block validation * NOTE: super csum type and incompat features will not be checked here. @@ -2495,6 +2561,8 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, ret = -EINVAL; } + ret = validate_sys_chunk_array(fs_info, sb); + /* * Obvious sys_chunk_array corruptions, it must hold at least one key * and one chunk @@ -2856,6 +2924,10 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block if (ret) return ret; + ret = percpu_counter_init(&fs_info->stats_read_blocks, 0, GFP_KERNEL); + if (ret) + return ret; + fs_info->dirty_metadata_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids)); @@ -3321,6 +3393,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits); fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; + fs_info->fs_devices->fs_info = fs_info; /* * Handle the space caching options appropriately now that we have the diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index a7051e2570c1..587842991b24 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -96,9 +96,6 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); /* * This function is used to grab the root, and avoid it is freed when we * access it. But it doesn't ensure that the tree is not dropped. 
- * - * If you want to ensure the whole tree is safe, you should use - * fs_info->subvol_srcu */ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3c6f7fecbb9a..3014a1a23efd 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -570,7 +570,6 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, btrfs_set_extent_data_ref_count(leaf, ref, num_refs); } } - btrfs_mark_buffer_dirty(trans, leaf); ret = 0; fail: btrfs_release_path(path); @@ -618,7 +617,6 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); else if (key.type == BTRFS_SHARED_DATA_REF_KEY) btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); - btrfs_mark_buffer_dirty(trans, leaf); } return ret; } @@ -1050,7 +1048,6 @@ void setup_inline_extent_backref(struct btrfs_trans_handle *trans, } else { btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); } - btrfs_mark_buffer_dirty(trans, leaf); } static int lookup_extent_backref(struct btrfs_trans_handle *trans, @@ -1195,7 +1192,6 @@ static noinline_for_stack int update_inline_extent_backref( item_size -= size; btrfs_truncate_item(trans, path, item_size, 1); } - btrfs_mark_buffer_dirty(trans, leaf); return 0; } @@ -1260,12 +1256,12 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, { int j, ret = 0; u64 bytes_left, end; - u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT); + u64 aligned_start = ALIGN(start, SECTOR_SIZE); /* Adjust the range to be aligned to 512B sectors if necessary. 
*/ if (start != aligned_start) { len -= aligned_start - start; - len = round_down(len, 1 << SECTOR_SHIFT); + len = round_down(len, SECTOR_SIZE); start = aligned_start; } @@ -1527,7 +1523,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (extent_op) __run_delayed_extent_op(extent_op, leaf, item); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* now insert the actual backref */ @@ -1711,8 +1706,6 @@ again: ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); __run_delayed_extent_op(extent_op, leaf, ei); - - btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; @@ -1803,30 +1796,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, return ret; } -static inline struct btrfs_delayed_ref_node * -select_delayed_ref(struct btrfs_delayed_ref_head *head) -{ - struct btrfs_delayed_ref_node *ref; - - if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) - return NULL; - - /* - * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. - * This is to prevent a ref count from going down to zero, which deletes - * the extent item from the extent tree, when there still are references - * to add, which would fail because they would not find the extent item. 
- */ - if (!list_empty(&head->ref_add_list)) - return list_first_entry(&head->ref_add_list, - struct btrfs_delayed_ref_node, add_list); - - ref = rb_entry(rb_first_cached(&head->ref_tree), - struct btrfs_delayed_ref_node, ref_node); - ASSERT(list_empty(&ref->add_list)); - return ref; -} - static struct btrfs_delayed_extent_op *cleanup_extent_op( struct btrfs_delayed_ref_head *head) { @@ -1959,7 +1928,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, lockdep_assert_held(&locked_ref->mutex); lockdep_assert_held(&locked_ref->lock); - while ((ref = select_delayed_ref(locked_ref))) { + while ((ref = btrfs_select_delayed_ref(locked_ref))) { if (ref->seq && btrfs_check_delayed_seq(fs_info, ref->seq)) { spin_unlock(&locked_ref->lock); @@ -2230,10 +2199,11 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, return ret; } -static noinline int check_delayed_ref(struct btrfs_root *root, +static noinline int check_delayed_ref(struct btrfs_inode *inode, struct btrfs_path *path, - u64 objectid, u64 offset, u64 bytenr) + u64 offset, u64 bytenr) { + struct btrfs_root *root = inode->root; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_root *delayed_refs; @@ -2307,7 +2277,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, * then we have a cross reference. */ if (ref->ref_root != btrfs_root_id(root) || - ref_owner != objectid || ref_offset != offset) { + ref_owner != btrfs_ino(inode) || ref_offset != offset) { ret = 1; break; } @@ -2318,11 +2288,53 @@ static noinline int check_delayed_ref(struct btrfs_root *root, return ret; } -static noinline int check_committed_ref(struct btrfs_root *root, +/* + * Check if there are references for a data extent other than the one belonging + * to the given inode and offset. + * + * @inode: The only inode we expect to find associated with the data extent. + * @path: A path to use for searching the extent tree. 
+ * @offset: The only offset we expect to find associated with the data extent. + * @bytenr: The logical address of the data extent. + * + * When the extent does not have any other references other than the one we + * expect to find, we always return a value of 0 with the path having a locked + * leaf that contains the extent's extent item - this is necessary to ensure + * we don't race with a task running delayed references, and our caller must + * have such a path when calling check_delayed_ref() - it must lock a delayed + * ref head while holding the leaf locked. In case the extent item is not found + * in the extent tree, we return -ENOENT with the path having the leaf (locked) + * where the extent item should be, in order to prevent races with another task + * running delayed references, so that we don't miss any reference when calling + * check_delayed_ref(). + * + * Note: this may return false positives, and this is because we want to be + * quick here as we're called in write paths (when flushing delalloc and + * in the direct IO write path). For example we can have an extent with + * a single reference but that reference is not inlined, or we may have + * many references in the extent tree but we also have delayed references + * that cancel all the reference except the one for our inode and offset, + * but it would be expensive to do such checks and complex due to all + * locking to avoid races between the checks and flushing delayed refs, + * plus non-inline references may be located on leaves other than the one + * that contains the extent item in the extent tree. The important thing + * here is to not return false negatives and that the false positives are + * not very common. + * + * Returns: 0 if there are no cross references and with the path having a locked + * leaf from the extent tree that contains the extent's extent item. + * + * 1 if there are cross references (false positives can happen). + * + * < 0 in case of an error. 
In case of -ENOENT the leaf in the extent + * tree where the extent item should be located at is read locked and + * accessible in the given path. + */ +static noinline int check_committed_ref(struct btrfs_inode *inode, struct btrfs_path *path, - u64 objectid, u64 offset, u64 bytenr, - bool strict) + u64 offset, u64 bytenr) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr); struct extent_buffer *leaf; @@ -2341,35 +2353,32 @@ static noinline int check_committed_ref(struct btrfs_root *root, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret == 0) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range. */ - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } - ret = -ENOENT; if (path->slots[0] == 0) - goto out; + return -ENOENT; path->slots[0]--; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) - goto out; + return -ENOENT; - ret = 1; item_size = btrfs_item_size(leaf, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY); /* No inline refs; we need to bail before checking for owner ref. */ if (item_size == sizeof(*ei)) - goto out; + return 1; /* Check for an owner ref; skip over it to the real inline refs. 
*/ iref = (struct btrfs_extent_inline_ref *)(ei + 1); @@ -2377,56 +2386,69 @@ static noinline int check_committed_ref(struct btrfs_root *root, if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) { expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY); iref = (struct btrfs_extent_inline_ref *)(iref + 1); + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); } /* If extent item has more than 1 inline ref then it's shared */ if (item_size != expected_size) - goto out; - - /* - * If extent created before last snapshot => it's shared unless the - * snapshot has been deleted. Use the heuristic if strict is false. - */ - if (!strict && - (btrfs_extent_generation(leaf, ei) <= - btrfs_root_last_snapshot(&root->root_item))) - goto out; + return 1; /* If this extent has SHARED_DATA_REF then it's shared */ - type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); if (type != BTRFS_EXTENT_DATA_REF_KEY) - goto out; + return 1; ref = (struct btrfs_extent_data_ref *)(&iref->offset); if (btrfs_extent_refs(leaf, ei) != btrfs_extent_data_ref_count(leaf, ref) || btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) || - btrfs_extent_data_ref_objectid(leaf, ref) != objectid || + btrfs_extent_data_ref_objectid(leaf, ref) != btrfs_ino(inode) || btrfs_extent_data_ref_offset(leaf, ref) != offset) - goto out; + return 1; - ret = 0; -out: - return ret; + return 0; } -int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, - u64 bytenr, bool strict, struct btrfs_path *path) +int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset, + u64 bytenr, struct btrfs_path *path) { int ret; do { - ret = check_committed_ref(root, path, objectid, - offset, bytenr, strict); + ret = check_committed_ref(inode, path, offset, bytenr); if (ret && ret != -ENOENT) goto out; - ret = check_delayed_ref(root, path, objectid, offset, bytenr); + /* + * The path must have a locked leaf from the 
extent tree where + * the extent item for our extent is located, in case it exists, + * or where it should be located in case it doesn't exist yet + * because it's new and its delayed ref was not yet flushed. + * We need to lock the delayed ref head at check_delayed_ref(), + * if one exists, while holding the leaf locked in order to not + * race with delayed ref flushing, missing references and + * incorrectly reporting that the extent is not shared. + */ + if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) { + struct extent_buffer *leaf = path->nodes[0]; + + ASSERT(leaf != NULL); + btrfs_assert_tree_read_locked(leaf); + + if (ret != -ENOENT) { + struct btrfs_key key; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ASSERT(key.objectid == bytenr); + ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY); + } + } + + ret = check_delayed_ref(inode, path, offset, bytenr); } while (ret == -EAGAIN && !path->nowait); out: btrfs_release_path(path); - if (btrfs_is_data_reloc_root(root)) + if (btrfs_is_data_reloc_root(inode->root)) WARN_ON(ret > 0); return ret; } @@ -2571,13 +2593,10 @@ static int pin_down_extent(struct btrfs_trans_handle *trans, struct btrfs_block_group *cache, u64 bytenr, u64 num_bytes, int reserved) { - struct btrfs_fs_info *fs_info = cache->fs_info; - spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned += num_bytes; - btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info, - num_bytes); + btrfs_space_info_update_bytes_pinned(cache->space_info, num_bytes); if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; @@ -2724,15 +2743,15 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, { struct btrfs_block_group *cache = NULL; struct btrfs_space_info *space_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; struct btrfs_free_cluster *cluster = NULL; - u64 len; u64 total_unpinned = 0; u64 empty_cluster = 0; bool readonly; int ret = 0; while (start <= end) { + u64 len; + 
readonly = false; if (!cache || start >= cache->start + cache->length) { @@ -2778,37 +2797,19 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; - btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); + btrfs_space_info_update_bytes_pinned(space_info, -len); space_info->max_extent_size = 0; if (cache->ro) { space_info->bytes_readonly += len; readonly = true; } else if (btrfs_is_zoned(fs_info)) { /* Need reset before reusing in a zoned block group */ - btrfs_space_info_update_bytes_zone_unusable(fs_info, space_info, - len); + btrfs_space_info_update_bytes_zone_unusable(space_info, len); readonly = true; } spin_unlock(&cache->lock); - if (!readonly && return_free_space && - global_rsv->space_info == space_info) { - spin_lock(&global_rsv->lock); - if (!global_rsv->full) { - u64 to_add = min(len, global_rsv->size - - global_rsv->reserved); - - global_rsv->reserved += to_add; - btrfs_space_info_update_bytes_may_use(fs_info, - space_info, to_add); - if (global_rsv->reserved >= global_rsv->size) - global_rsv->full = 1; - len -= to_add; - } - spin_unlock(&global_rsv->lock); - } - /* Add to any tickets we may have */ - if (!readonly && return_free_space && len) - btrfs_try_granting_tickets(fs_info, space_info); + if (!readonly && return_free_space) + btrfs_return_free_space(space_info, len); spin_unlock(&space_info->lock); } @@ -3259,7 +3260,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } else { btrfs_set_extent_refs(leaf, ei, refs); - btrfs_mark_buffer_dirty(trans, leaf); } if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, @@ -4827,7 +4827,6 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); } - btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_free_path(path); return alloc_reserved_extent(trans, ins->objectid, ins->offset); @@ -4902,7 +4901,6 
@@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root); } - btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 2ad51130c037..cfa52264f678 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -116,8 +116,7 @@ int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, const struct extent_buffer *eb); int btrfs_exclude_logged_extents(struct extent_buffer *eb); -int btrfs_cross_ref_exist(struct btrfs_root *root, - u64 objectid, u64 offset, u64 bytenr, bool strict, +int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset, u64 bytenr, struct btrfs_path *path); struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -163,5 +162,9 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, struct extent_buffer *parent); +void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); +int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, + u64 num_bytes, u64 *actual_bytes); +int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); #endif diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b923d0cec61c..b2fae67f8fa3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -198,9 +198,8 @@ static void __process_folios_contig(struct address_space *mapping, u64 end, unsigned long page_ops) { struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); - pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; - pgoff_t index = start_index; struct folio_batch fbatch; int i; @@ -221,7 +220,7 @@ static 
void __process_folios_contig(struct address_space *mapping, } } -static noinline void __unlock_for_delalloc(const struct inode *inode, +static noinline void unlock_delalloc_folio(const struct inode *inode, const struct folio *locked_folio, u64 start, u64 end) { @@ -242,9 +241,8 @@ static noinline int lock_delalloc_folios(struct inode *inode, { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct address_space *mapping = inode->i_mapping; - pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; - pgoff_t index = start_index; u64 processed_end = start; struct folio_batch fbatch; @@ -288,8 +286,7 @@ static noinline int lock_delalloc_folios(struct inode *inode, out: folio_batch_release(&fbatch); if (processed_end > start) - __unlock_for_delalloc(inode, locked_folio, start, - processed_end); + unlock_delalloc_folio(inode, locked_folio, start, processed_end); return -EAGAIN; } @@ -390,7 +387,7 @@ again: unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); if (!ret) { - __unlock_for_delalloc(inode, locked_folio, delalloc_start, + unlock_delalloc_folio(inode, locked_folio, delalloc_start, delalloc_end); cond_resched(); goto again; @@ -526,8 +523,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) u64 end; u32 len; - /* For now only order 0 folios are supported for data. */ - ASSERT(folio_order(folio) == 0); btrfs_debug(fs_info, "%s: bi_sector=%llu, err=%d, mirror=%u", __func__, bio->bi_iter.bi_sector, bio->bi_status, @@ -555,7 +550,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) if (likely(uptodate)) { loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> folio_shift(folio); /* * Zero out the remaining part if this range straddles @@ -564,9 +558,11 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) * Here we should only zero the range inside the folio, * not touch anything else. * - * NOTE: i_size is exclusive while end is inclusive. 
+ * NOTE: i_size is exclusive while end is inclusive and + * folio_contains() takes PAGE_SIZE units. */ - if (folio_index(folio) == end_index && i_size <= end) { + if (folio_contains(folio, i_size >> PAGE_SHIFT) && + i_size <= end) { u32 zero_start = max(offset_in_folio(folio, i_size), offset_in_folio(folio, start)); u32 zero_len = offset_in_folio(folio, end) + 1 - @@ -632,7 +628,7 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, for (allocated = 0; allocated < nr_pages;) { unsigned int last = allocated; - allocated = alloc_pages_bulk_array(gfp, nr_pages, page_array); + allocated = alloc_pages_bulk(gfp, nr_pages, page_array); if (unlikely(allocated == last)) { /* No progress, fail and do cleanup. */ for (int i = 0; i < allocated; i++) { @@ -710,6 +706,7 @@ static void alloc_new_bio(struct btrfs_inode *inode, bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info, bio_ctrl->end_io_func, NULL); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; + bbio->bio.bi_write_hint = inode->vfs_inode.i_write_hint; bbio->inode = inode; bbio->file_offset = file_offset; bio_ctrl->bbio = bbio; @@ -862,11 +859,6 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb, return ret; } -int set_page_extent_mapped(struct page *page) -{ - return set_folio_extent_mapped(page_folio(page)); -} - int set_folio_extent_mapped(struct folio *folio) { struct btrfs_fs_info *fs_info; @@ -901,12 +893,11 @@ void clear_folio_extent_mapped(struct folio *folio) folio_detach_private(folio); } -static struct extent_map *__get_extent_map(struct inode *inode, - struct folio *folio, u64 start, - u64 len, struct extent_map **em_cached) +static struct extent_map *get_extent_map(struct btrfs_inode *inode, + struct folio *folio, u64 start, + u64 len, struct extent_map **em_cached) { struct extent_map *em; - struct extent_state *cached_state = NULL; ASSERT(em_cached); @@ -922,14 +913,12 @@ static struct extent_map *__get_extent_map(struct inode *inode, *em_cached 
= NULL; } - btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), start, start + len - 1, &cached_state); - em = btrfs_get_extent(BTRFS_I(inode), folio, start, len); + em = btrfs_get_extent(inode, folio, start, len); if (!IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); *em_cached = em; } - unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state); return em; } @@ -963,7 +952,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, return ret; } - if (folio->index == last_byte >> folio_shift(folio)) { + if (folio_contains(folio, last_byte >> PAGE_SHIFT)) { size_t zero_offset = offset_in_folio(folio, last_byte); if (zero_offset) { @@ -985,8 +974,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, end_folio_read(folio, true, cur, iosize); break; } - em = __get_extent_map(inode, folio, cur, end - cur + 1, - em_cached); + em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached); if (IS_ERR(em)) { end_folio_read(folio, false, cur, end + 1 - cur); return PTR_ERR(em); @@ -1087,11 +1075,18 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, int btrfs_read_folio(struct file *file, struct folio *folio) { + struct btrfs_inode *inode = folio_to_inode(folio); + const u64 start = folio_pos(folio); + const u64 end = start + folio_size(folio) - 1; + struct extent_state *cached_state = NULL; struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; struct extent_map *em_cached = NULL; int ret; + btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); + unlock_extent(&inode->io_tree, start, end, &cached_state); + free_extent_map(em_cached); /* @@ -1142,14 +1137,19 @@ static bool find_next_delalloc_bitmap(struct folio *folio, } /* - * helper for extent_writepage(), doing all of the delayed allocation setup. + * Do all of the delayed allocation setup. 
+ * + * Return >0 if all the dirty blocks are submitted async (compression) or inlined. + * The @folio should no longer be touched (treat it as already unlocked). * - * This returns 1 if btrfs_run_delalloc_range function did all the work required - * to write the page (copy into inline extent). In this case the IO has - * been started and the page is already unlocked. + * Return 0 if there is still dirty block that needs to be submitted through + * extent_writepage_io(). + * bio_ctrl->submit_bitmap will indicate which blocks of the folio should be + * submitted, and @folio is still kept locked. * - * This returns 0 if all went well (page still locked) - * This returns < 0 if there were errors (page still locked) + * Return <0 if there is any error hit. + * Any allocated ordered extent range covering this folio will be marked + * finished (IOERR), and @folio is still kept locked. */ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, struct folio *folio, @@ -1167,6 +1167,16 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, * last delalloc end. */ u64 last_delalloc_end = 0; + /* + * The range end (exclusive) of the last successfully finished delalloc + * range. + * Any range covered by ordered extent must either be manually marked + * finished (error handling), or has IO submitted (and finish the + * ordered extent normally). + * + * This records the end of ordered extent cleanup if we hit an error. + */ + u64 last_finished_delalloc_end = page_start; u64 delalloc_start = page_start; u64 delalloc_end = page_end; u64 delalloc_to_write = 0; @@ -1235,11 +1245,28 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, found_len = last_delalloc_end + 1 - found_start; if (ret >= 0) { + /* + * Some delalloc range may be created by previous folios. + * Thus we still need to clean up this range during error + * handling. 
+ */ + last_finished_delalloc_end = found_start; /* No errors hit so far, run the current delalloc range. */ ret = btrfs_run_delalloc_range(inode, folio, found_start, found_start + found_len - 1, wbc); + if (ret >= 0) + last_finished_delalloc_end = found_start + found_len; + if (unlikely(ret < 0)) + btrfs_err_rl(fs_info, +"failed to run delalloc range, root=%lld ino=%llu folio=%llu submit_bitmap=%*pbl start=%llu len=%u: %d", + btrfs_root_id(inode->root), + btrfs_ino(inode), + folio_pos(folio), + fs_info->sectors_per_page, + &bio_ctrl->submit_bitmap, + found_start, found_len, ret); } else { /* * We've hit an error during previous delalloc range, @@ -1247,7 +1274,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, */ unlock_extent(&inode->io_tree, found_start, found_start + found_len - 1, NULL); - __unlock_for_delalloc(&inode->vfs_inode, folio, + unlock_delalloc_folio(&inode->vfs_inode, folio, found_start, found_start + found_len - 1); } @@ -1274,8 +1301,22 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, delalloc_start = found_start + found_len; } - if (ret < 0) + /* + * It's possible we had some ordered extents created before we hit + * an error, cleanup non-async successfully created delalloc ranges. 
+ */ + if (unlikely(ret < 0)) { + unsigned int bitmap_size = min( + (last_finished_delalloc_end - page_start) >> + fs_info->sectorsize_bits, + fs_info->sectors_per_page); + + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size) + btrfs_mark_ordered_io_finished(inode, folio, + page_start + (bit << fs_info->sectorsize_bits), + fs_info->sectorsize, false); return ret; + } out: if (last_delalloc_end) delalloc_end = last_delalloc_end; @@ -1335,7 +1376,7 @@ static int submit_one_sector(struct btrfs_inode *inode, em = btrfs_get_extent(inode, NULL, filepos, sectorsize); if (IS_ERR(em)) - return PTR_ERR_OR_ZERO(em); + return PTR_ERR(em); extent_offset = filepos - em->start; em_end = extent_map_end(em); @@ -1391,6 +1432,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long range_bitmap = 0; bool submitted_io = false; + bool error = false; const u64 folio_start = folio_pos(folio); u64 cur; int bit; @@ -1433,11 +1475,26 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, break; } ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size); - if (ret < 0) - goto out; + if (unlikely(ret < 0)) { + /* + * bio_ctrl may contain a bio crossing several folios. + * Submit it immediately so that the bio has a chance + * to finish normally, other than marked as error. + */ + submit_one_bio(bio_ctrl); + /* + * Failed to grab the extent map which should be very rare. + * Since there is no bio submitted to finish the ordered + * extent, we have to manually finish this sector. + */ + btrfs_mark_ordered_io_finished(inode, folio, cur, + fs_info->sectorsize, false); + error = true; + continue; + } submitted_io = true; } -out: + /* * If we didn't submitted any sector (>= i_size), folio dirty get * cleared but PAGECACHE_TAG_DIRTY is not cleared (only cleared @@ -1445,8 +1502,11 @@ out: * * Here we set writeback and clear for the range. 
If the full folio * is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag. + * + * If we hit any error, the corresponding sector will still be dirty + * thus no need to clear PAGECACHE_TAG_DIRTY. */ - if (!submitted_io) { + if (!submitted_io && !error) { btrfs_folio_set_writeback(fs_info, folio, start, len); btrfs_folio_clear_writeback(fs_info, folio, start, len); } @@ -1464,15 +1524,14 @@ out: */ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl) { - struct inode *inode = folio->mapping->host; - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - const u64 page_start = folio_pos(folio); + struct btrfs_inode *inode = BTRFS_I(folio->mapping->host); + struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret; size_t pg_offset; - loff_t i_size = i_size_read(inode); + loff_t i_size = i_size_read(&inode->vfs_inode); unsigned long end_index = i_size >> PAGE_SHIFT; - trace_extent_writepage(folio, inode, bio_ctrl->wbc); + trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc); WARN_ON(!folio_test_locked(folio)); @@ -1496,26 +1555,28 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl if (ret < 0) goto done; - ret = writepage_delalloc(BTRFS_I(inode), folio, bio_ctrl); + ret = writepage_delalloc(inode, folio, bio_ctrl); if (ret == 1) return 0; if (ret) goto done; - ret = extent_writepage_io(BTRFS_I(inode), folio, folio_pos(folio), + ret = extent_writepage_io(inode, folio, folio_pos(folio), PAGE_SIZE, bio_ctrl, i_size); if (ret == 1) return 0; + if (ret < 0) + btrfs_err_rl(fs_info, +"failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d", + btrfs_root_id(inode->root), btrfs_ino(inode), + folio_pos(folio), fs_info->sectors_per_page, + &bio_ctrl->submit_bitmap, ret); bio_ctrl->wbc->nr_to_write--; done: - if (ret) { - btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio, - page_start, PAGE_SIZE, !ret); + if (ret < 0) mapping_set_error(folio->mapping, ret); - } - 
/* * Only unlock ranges that are submitted. As there can be some async * submitted ranges inside the folio. @@ -1525,12 +1586,6 @@ done: return ret; } -void wait_on_extent_buffer_writeback(struct extent_buffer *eb) -{ - wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, - TASK_UNINTERRUPTIBLE); -} - /* * Lock extent buffer status and pages for writeback. * @@ -1671,11 +1726,10 @@ static void end_bbio_meta_write(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; - bool uptodate = !bbio->bio.bi_status; struct folio_iter fi; u32 bio_offset = 0; - if (!uptodate) + if (bbio->bio.bi_status != BLK_STS_OK) set_btree_ioerr(eb); bio_for_each_folio_all(fi, &bbio->bio) { @@ -2292,11 +2346,8 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f if (ret == 1) goto next_page; - if (ret) { - btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio, - cur, cur_len, !ret); + if (ret) mapping_set_error(mapping, ret); - } btrfs_folio_end_lock(fs_info, folio, cur, cur_len); if (ret < 0) found_error = true; @@ -2332,12 +2383,20 @@ void btrfs_readahead(struct readahead_control *rac) { struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD }; struct folio *folio; + struct btrfs_inode *inode = BTRFS_I(rac->mapping->host); + const u64 start = readahead_pos(rac); + const u64 end = start + readahead_length(rac) - 1; + struct extent_state *cached_state = NULL; struct extent_map *em_cached = NULL; u64 prev_em_start = (u64)-1; + btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); + while ((folio = readahead_folio(rac)) != NULL) btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); + unlock_extent(&inode->io_tree, start, end, &cached_state); + if (em_cached) free_extent_map(em_cached); submit_one_bio(&bio_ctrl); @@ -2495,11 +2554,6 @@ next: return try_release_extent_state(io_tree, folio); } -static void __free_extent_buffer(struct extent_buffer *eb) -{ - 
kmem_cache_free(extent_buffer_cache, eb); -} - static int extent_buffer_under_io(const struct extent_buffer *eb) { return (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || @@ -2580,8 +2634,8 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo spin_unlock(&folio->mapping->i_private_lock); } -/* Release all pages attached to the extent buffer */ -static void btrfs_release_extent_buffer_pages(const struct extent_buffer *eb) +/* Release all folios attached to the extent buffer */ +static void btrfs_release_extent_buffer_folios(const struct extent_buffer *eb) { ASSERT(!extent_buffer_under_io(eb)); @@ -2603,9 +2657,9 @@ static void btrfs_release_extent_buffer_pages(const struct extent_buffer *eb) */ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) { - btrfs_release_extent_buffer_pages(eb); + btrfs_release_extent_buffer_folios(eb); btrfs_leak_debug_del_eb(eb); - __free_extent_buffer(eb); + kmem_cache_free(extent_buffer_cache, eb); } static struct extent_buffer * @@ -2703,7 +2757,7 @@ err: folio_put(eb->folios[i]); } } - __free_extent_buffer(eb); + kmem_cache_free(extent_buffer_cache, eb); return NULL; } @@ -2830,13 +2884,12 @@ free_eb: } #endif -static struct extent_buffer *grab_extent_buffer( - struct btrfs_fs_info *fs_info, struct page *page) +static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, + struct folio *folio) { - struct folio *folio = page_folio(page); struct extent_buffer *exists; - lockdep_assert_held(&page->mapping->i_private_lock); + lockdep_assert_held(&folio->mapping->i_private_lock); /* * For subpage case, we completely rely on radix tree to ensure we @@ -2851,7 +2904,7 @@ static struct extent_buffer *grab_extent_buffer( return NULL; /* - * We could have already allocated an eb for this page and attached one + * We could have already allocated an eb for this folio and attached one * so lets see if we can get a ref on the existing eb, and if we can we * know it's good and 
we can just return that one, else we know we can * just overwrite folio private. @@ -2860,16 +2913,19 @@ static struct extent_buffer *grab_extent_buffer( if (atomic_inc_not_zero(&exists->refs)) return exists; - WARN_ON(PageDirty(page)); + WARN_ON(folio_test_dirty(folio)); folio_detach_private(folio); return NULL; } -static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) +/* + * Validate alignment constraints of eb at logical address @start. + */ +static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) { if (!IS_ALIGNED(start, fs_info->sectorsize)) { btrfs_err(fs_info, "bad tree block start %llu", start); - return -EINVAL; + return true; } if (fs_info->nodesize < PAGE_SIZE && @@ -2877,14 +2933,14 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) btrfs_err(fs_info, "tree block crosses page boundary, start %llu nodesize %u", start, fs_info->nodesize); - return -EINVAL; + return true; } if (fs_info->nodesize >= PAGE_SIZE && !PAGE_ALIGNED(start)) { btrfs_err(fs_info, "tree block is not page aligned, start %llu nodesize %u", start, fs_info->nodesize); - return -EINVAL; + return true; } if (!IS_ALIGNED(start, fs_info->nodesize) && !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) { @@ -2892,10 +2948,9 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) "tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance", start, fs_info->nodesize); } - return 0; + return false; } - /* * Return 0 if eb->folios[i] is attached to btree inode successfully. * Return >0 if there is already another extent buffer for the range, @@ -2951,8 +3006,7 @@ finish: } else if (existing_folio) { struct extent_buffer *existing_eb; - existing_eb = grab_extent_buffer(fs_info, - folio_page(existing_folio, 0)); + existing_eb = grab_extent_buffer(fs_info, existing_folio); if (existing_eb) { /* The extent buffer still exists, we can use it directly. 
*/ *found_eb_ret = existing_eb; @@ -3149,7 +3203,7 @@ again: * live buffer and won't free them prematurely. */ for (int i = 0; i < num_folios; i++) - unlock_page(folio_page(eb->folios[i], 0)); + folio_unlock(eb->folios[i]); return eb; out: @@ -3173,7 +3227,7 @@ out: for (int i = 0; i < attached; i++) { ASSERT(eb->folios[i]); detach_extent_buffer_folio(eb, eb->folios[i]); - unlock_page(folio_page(eb->folios[i], 0)); + folio_unlock(eb->folios[i]); folio_put(eb->folios[i]); eb->folios[i] = NULL; } @@ -3195,7 +3249,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) struct extent_buffer *eb = container_of(head, struct extent_buffer, rcu_head); - __free_extent_buffer(eb); + kmem_cache_free(extent_buffer_cache, eb); } static int release_extent_buffer(struct extent_buffer *eb) @@ -3219,11 +3273,11 @@ static int release_extent_buffer(struct extent_buffer *eb) } btrfs_leak_debug_del_eb(eb); - /* Should be safe to release our pages at this point */ - btrfs_release_extent_buffer_pages(eb); + /* Should be safe to release folios at this point. */ + btrfs_release_extent_buffer_folios(eb); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) { - __free_extent_buffer(eb); + kmem_cache_free(extent_buffer_cache, eb); return 1; } #endif @@ -3382,12 +3436,12 @@ void set_extent_buffer_dirty(struct extent_buffer *eb) * the above race. 
*/ if (subpage) - lock_page(folio_page(eb->folios[0], 0)); + folio_lock(eb->folios[0]); for (int i = 0; i < num_folios; i++) btrfs_folio_set_dirty(eb->fs_info, eb->folios[i], eb->start, eb->len); if (subpage) - unlock_page(folio_page(eb->folios[0], 0)); + folio_unlock(eb->folios[0]); percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, eb->len, eb->fs_info->dirty_metadata_batch); @@ -3497,8 +3551,8 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio) bio_put(&bbio->bio); } -int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, - const struct btrfs_tree_parent_check *check) +int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, + const struct btrfs_tree_parent_check *check) { struct btrfs_bio *bbio; bool ret; @@ -3516,7 +3570,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, /* Someone else is already reading the buffer, just wait for it. */ if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) - goto done; + return 0; /* * Between the initial test_bit(EXTENT_BUFFER_UPTODATE) and the above @@ -3556,14 +3610,21 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, } } btrfs_submit_bbio(bbio, mirror_num); + return 0; +} -done: - if (wait == WAIT_COMPLETE) { - wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); - if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) - return -EIO; - } +int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, + const struct btrfs_tree_parent_check *check) +{ + int ret; + ret = read_extent_buffer_pages_nowait(eb, mirror_num, check); + if (ret < 0) + return ret; + + wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); + if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) + return -EIO; return 0; } @@ -4294,7 +4355,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, return; } - ret = read_extent_buffer_pages(eb, WAIT_NONE, 0, &check); + 
ret = read_extent_buffer_pages_nowait(eb, 0, &check); if (ret < 0) free_extent_buffer_stale(eb); else diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 8a36117ed453..6c5328bfabc2 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -248,7 +248,6 @@ int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); void btrfs_readahead(struct readahead_control *rac); int set_folio_extent_mapped(struct folio *folio); -int set_page_extent_mapped(struct page *page); void clear_folio_extent_mapped(struct folio *folio); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, @@ -262,12 +261,17 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); void free_extent_buffer(struct extent_buffer *eb); void free_extent_buffer_stale(struct extent_buffer *eb); -#define WAIT_NONE 0 -#define WAIT_COMPLETE 1 -#define WAIT_PAGE_LOCK 2 -int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, +int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, const struct btrfs_tree_parent_check *parent_check); -void wait_on_extent_buffer_writeback(struct extent_buffer *eb); +int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, + const struct btrfs_tree_parent_check *parent_check); + +static inline void wait_on_extent_buffer_writeback(struct extent_buffer *eb) +{ + wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, + TASK_UNINTERRUPTIBLE); +} + void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, u64 gen, int level); void btrfs_readahead_node_child(struct extent_buffer *node, int slot); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 67ce85ff0ae2..7f46abbd6311 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1128,6 +1128,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c long nr_dropped = 0; struct rb_node *node; + 
lockdep_assert_held_write(&tree->lock); + /* * Take the mmap lock so that we serialize with the inode logging phase * of fsync because we may need to set the full sync flag on the inode, @@ -1139,28 +1141,12 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c * to find new extents, which may not be there yet because ordered * extents haven't completed yet. * - * We also do a try lock because otherwise we could deadlock. This is - * because the shrinker for this filesystem may be invoked while we are - * in a path that is holding the mmap lock in write mode. For example in - * a reflink operation while COWing an extent buffer, when allocating - * pages for a new extent buffer and under memory pressure, the shrinker - * may be invoked, and therefore we would deadlock by attempting to read - * lock the mmap lock while we are holding already a write lock on it. + * We also do a try lock because we don't want to block for too long and + * we are holding the extent map tree's lock in write mode. */ if (!down_read_trylock(&inode->i_mmap_lock)) return 0; - /* - * We want to be fast so if the lock is busy we don't want to spend time - * waiting for it - either some task is about to do IO for the inode or - * we may have another task shrinking extent maps, here in this code, so - * skip this inode. 
- */ - if (!write_trylock(&tree->lock)) { - up_read(&inode->i_mmap_lock); - return 0; - } - node = rb_first(&tree->root); while (node) { struct rb_node *next = rb_next(node); @@ -1201,12 +1187,61 @@ next: break; node = next; } - write_unlock(&tree->lock); up_read(&inode->i_mmap_lock); return nr_dropped; } +static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root, + u64 min_ino) +{ + struct btrfs_inode *inode; + unsigned long from = min_ino; + + xa_lock(&root->inodes); + while (true) { + struct extent_map_tree *tree; + + inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT); + if (!inode) + break; + + tree = &inode->extent_tree; + + /* + * We want to be fast so if the lock is busy we don't want to + * spend time waiting for it (some task is about to do IO for + * the inode). + */ + if (!write_trylock(&tree->lock)) + goto next; + + /* + * Skip inode if it doesn't have loaded extent maps, so we avoid + * getting a reference and doing an iput later. This includes + * cases like files that were opened for things like stat(2), or + * files with all extent maps previously released through the + * release folio callback (btrfs_release_folio()) or released in + * a previous run, or directories which never have extent maps. 
+ */ + if (RB_EMPTY_ROOT(&tree->root)) { + write_unlock(&tree->lock); + goto next; + } + + if (igrab(&inode->vfs_inode)) + break; + + write_unlock(&tree->lock); +next: + from = btrfs_ino(inode) + 1; + cond_resched_lock(&root->inodes.xa_lock); + } + xa_unlock(&root->inodes); + + return inode; +} + static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -1214,21 +1249,21 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx long nr_dropped = 0; u64 min_ino = fs_info->em_shrinker_last_ino + 1; - inode = btrfs_find_first_inode(root, min_ino); + inode = find_first_inode_to_shrink(root, min_ino); while (inode) { nr_dropped += btrfs_scan_inode(inode, ctx); + write_unlock(&inode->extent_tree.lock); min_ino = btrfs_ino(inode) + 1; fs_info->em_shrinker_last_ino = btrfs_ino(inode); - btrfs_add_delayed_iput(inode); + iput(&inode->vfs_inode); - if (ctx->scanned >= ctx->nr_to_scan || - btrfs_fs_closing(inode->root->fs_info)) + if (ctx->scanned >= ctx->nr_to_scan || btrfs_fs_closing(fs_info)) break; cond_resched(); - inode = btrfs_find_first_inode(root, min_ino); + inode = find_first_inode_to_shrink(root, min_ino); } if (inode) { diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 886749b39672..d04a3b47b1fb 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -190,8 +190,6 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_compression(leaf, item, 0); btrfs_set_file_extent_encryption(leaf, item, 0); btrfs_set_file_extent_other_encoding(leaf, item, 0); - - btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; @@ -1259,7 +1257,6 @@ found: ins_size /= csum_size; total_bytes += ins_size * fs_info->sectorsize; - btrfs_mark_buffer_dirty(trans, path->nodes[0]); if (total_bytes < sums->len) { btrfs_release_path(path); cond_resched(); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 
14e27473c5bc..0b568c8d24cb 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -36,52 +36,7 @@ #include "ioctl.h" #include "file.h" #include "super.h" - -/* - * Helper to fault in page and copy. This should go away and be replaced with - * calls into generic code. - */ -static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, - struct folio *folio, struct iov_iter *i) -{ - size_t copied = 0; - size_t total_copied = 0; - int offset = offset_in_page(pos); - - while (write_bytes > 0) { - size_t count = min_t(size_t, PAGE_SIZE - offset, write_bytes); - /* - * Copy data from userspace to the current page - */ - copied = copy_folio_from_iter_atomic(folio, offset, count, i); - - /* Flush processor's dcache for this page */ - flush_dcache_folio(folio); - - /* - * if we get a partial write, we can end up with - * partially up to date page. These add - * a lot of complexity, so make sure they don't - * happen by forcing this copy to be retried. - * - * The rest of the btrfs_file_write code will fall - * back to page at a time copies after we return 0. - */ - if (unlikely(copied < count)) { - if (!folio_test_uptodate(folio)) { - iov_iter_revert(i, copied); - copied = 0; - } - if (!copied) - break; - } - - write_bytes -= copied; - total_copied += copied; - offset += copied; - } - return total_copied; -} +#include "print-tree.h" /* * Unlock folio after btrfs_file_write() is done with it. @@ -106,7 +61,7 @@ static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio, } /* - * After btrfs_copy_from_user(), update the following things for delalloc: + * After copy_folio_from_iter_atomic(), update the following things for delalloc: * - Mark newly dirtied folio as DELALLOC in the io tree. * Used to advise which range is to be written back. 
* - Mark modified folio as Uptodate/Dirty and not needing COW fixup @@ -224,7 +179,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, if (args->drop_cache) btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false); - if (args->start >= inode->disk_i_size && !args->replace_extent) + if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent) modify_tree = 0; update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID); @@ -245,7 +200,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { - BUG_ON(del_nr > 0); + if (WARN_ON(del_nr > 0)) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } ret = btrfs_next_leaf(root, path); if (ret < 0) break; @@ -321,7 +280,11 @@ next_slot: * | -------- extent -------- | */ if (args->start > key.offset && args->end < extent_end) { - BUG_ON(del_nr > 0); + if (WARN_ON(del_nr > 0)) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; break; @@ -351,7 +314,6 @@ next_slot: btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - args->start); - btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) { struct btrfs_ref ref = { @@ -397,7 +359,6 @@ next_slot: btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - args->end); - btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) args->bytes_found += args->end - key.offset; break; @@ -409,7 +370,11 @@ next_slot: * | -------- extent -------- | */ if (args->start > key.offset && args->end >= extent_end) { - BUG_ON(del_nr > 0); + if (WARN_ON(del_nr > 0)) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; break; @@ -417,7 +382,6 @@ next_slot: 
btrfs_set_file_extent_num_bytes(leaf, fi, args->start - key.offset); - btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) args->bytes_found += extent_end - args->start; if (args->end == extent_end) @@ -437,7 +401,11 @@ delete_extent_item: del_slot = path->slots[0]; del_nr = 1; } else { - BUG_ON(del_slot + del_nr != path->slots[0]); + if (WARN_ON(del_slot + del_nr != path->slots[0])) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } del_nr++; } @@ -668,7 +636,6 @@ again: trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, end - other_start); - btrfs_mark_buffer_dirty(trans, leaf); goto out; } } @@ -697,7 +664,6 @@ again: other_end - start); btrfs_set_file_extent_offset(leaf, fi, start - orig_offset); - btrfs_mark_buffer_dirty(trans, leaf); goto out; } } @@ -731,7 +697,6 @@ again: btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); - btrfs_mark_buffer_dirty(trans, leaf); ref.action = BTRFS_ADD_DELAYED_REF; ref.bytenr = bytenr; @@ -810,7 +775,6 @@ again: btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); } else { fi = btrfs_item_ptr(leaf, del_slot - 1, struct btrfs_file_extent_item); @@ -819,7 +783,6 @@ again: btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); - btrfs_mark_buffer_dirty(trans, leaf); ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret < 0) { @@ -1052,7 +1015,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, &cached_state); } ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, - NULL, nowait, false); + NULL, nowait); if (ret <= 0) btrfs_drew_write_unlock(&root->snapshot_lock); else @@ -1076,7 +1039,6 @@ int btrfs_write_check(struct kiocb *iocb, size_t count) loff_t pos = iocb->ki_pos; int ret; loff_t oldsize; - 
loff_t start_pos; /* * Quickly bail out on NOWAIT writes if we don't have the nodatacow or @@ -1103,9 +1065,8 @@ int btrfs_write_check(struct kiocb *iocb, size_t count) inode_inc_iversion(inode); } - start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); - if (start_pos > oldsize) { + if (pos > oldsize) { /* Expand hole size to cover write data, preventing empty gap */ loff_t end_pos = round_up(pos + count, fs_info->sectorsize); @@ -1129,7 +1090,7 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) u64 lockend; size_t num_written = 0; ssize_t ret; - loff_t old_isize = i_size_read(inode); + loff_t old_isize; unsigned int ilock_flags = 0; const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0); @@ -1142,6 +1103,13 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) if (ret < 0) return ret; + /* + * We can only trust the isize with inode lock held, or it can race with + * other buffered writes and cause incorrect call of + * pagecache_isize_extended() to overwrite existing data. + */ + old_isize = i_size_read(inode); + ret = generic_write_checks(iocb, i); if (ret <= 0) goto out; @@ -1252,7 +1220,23 @@ again: break; } - copied = btrfs_copy_from_user(pos, write_bytes, folio, i); + copied = copy_folio_from_iter_atomic(folio, + offset_in_folio(folio, pos), write_bytes, i); + flush_dcache_folio(folio); + + /* + * If we get a partial write, we can end up with partially + * uptodate page. Although if sector size < page size we can + * handle it, but if it's not sector aligned it can cause + * a lot of complexity, so make sure they don't happen by + * forcing retry this copy. 
+ */ + if (unlikely(copied < write_bytes)) { + if (!folio_test_uptodate(folio)) { + iov_iter_revert(i, copied); + copied = 0; + } + } num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes); dirty_sectors = round_up(copied + sector_offset, @@ -2029,7 +2013,6 @@ static int fill_holes(struct btrfs_trans_handle *trans, btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); goto out; } @@ -2046,7 +2029,6 @@ static int fill_holes(struct btrfs_trans_handle *trans, btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); goto out; } btrfs_release_path(path); @@ -2194,7 +2176,6 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_num_bytes(leaf, extent, replace_len); if (extent_info->is_new_extent) btrfs_set_file_extent_generation(leaf, extent, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index cfa52ef40b06..d42b6f882f57 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -12,7 +12,7 @@ #include <linux/error-injection.h> #include <linux/sched/mm.h> #include <linux/string_choices.h> -#include "ctree.h" +#include "extent-tree.h" #include "fs.h" #include "messages.h" #include "misc.h" @@ -198,7 +198,6 @@ static int __create_free_space_inode(struct btrfs_root *root, btrfs_set_inode_nlink(leaf, inode_item, 1); btrfs_set_inode_transid(leaf, inode_item, trans->transid); btrfs_set_inode_block_group(leaf, inode_item, offset); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); key.objectid = BTRFS_FREE_SPACE_OBJECTID; @@ -216,7 +215,6 @@ 
static int __create_free_space_inode(struct btrfs_root *root, struct btrfs_free_space_header); memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header)); btrfs_set_free_space_key(leaf, header, &disk_key); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); return 0; @@ -463,7 +461,7 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) return -ENOMEM; } - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(page_folio(page)); if (ret < 0) { unlock_page(page); put_page(page); @@ -1189,7 +1187,6 @@ update_cache_item(struct btrfs_trans_handle *trans, btrfs_set_free_space_entries(leaf, header, entries); btrfs_set_free_space_bitmaps(leaf, header, bitmaps); btrfs_set_free_space_generation(leaf, header, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); return 0; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 7ba50e133921..cae540ec15ed 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -89,7 +89,6 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans, struct btrfs_free_space_info); btrfs_set_free_space_extent_count(leaf, info, 0); btrfs_set_free_space_flags(leaf, info, 0); - btrfs_mark_buffer_dirty(trans, leaf); ret = 0; out: @@ -287,7 +286,6 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, flags |= BTRFS_FREE_SPACE_USING_BITMAPS; btrfs_set_free_space_flags(leaf, info, flags); expected_extent_count = btrfs_free_space_extent_count(leaf, info); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); if (extent_count != expected_extent_count) { @@ -324,7 +322,6 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); write_extent_buffer(leaf, bitmap_cursor, ptr, data_size); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); i += extent_size; @@ -430,7 +427,6 @@ int convert_free_space_to_extents(struct 
btrfs_trans_handle *trans, flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS; btrfs_set_free_space_flags(leaf, info, flags); expected_extent_count = btrfs_free_space_extent_count(leaf, info); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); nrbits = block_group->length >> block_group->fs_info->sectorsize_bits; @@ -495,7 +491,6 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans, extent_count += new_extents; btrfs_set_free_space_extent_count(path->nodes[0], info, extent_count); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) && @@ -1350,6 +1345,12 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_end_transaction(trans); return ret; } + if (btrfs_should_end_transaction(trans)) { + btrfs_end_transaction(trans); + trans = btrfs_start_transaction(free_space_root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + } node = rb_next(node); } diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index 31c1648bc0b4..09cfb43580cb 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -4,6 +4,136 @@ #include "ctree.h" #include "fs.h" #include "accessors.h" +#include "volumes.h" + +static const struct btrfs_csums { + u16 size; + const char name[10]; + const char driver[12]; +} btrfs_csums[] = { + [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" }, + [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" }, + [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" }, + [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b", + .driver = "blake2b-256" }, +}; + +/* This exists for btrfs-progs usages. */ +u16 btrfs_csum_type_size(u16 type) +{ + return btrfs_csums[type].size; +} + +int btrfs_super_csum_size(const struct btrfs_super_block *s) +{ + u16 t = btrfs_super_csum_type(s); + + /* csum type is validated at mount time. 
*/ + return btrfs_csum_type_size(t); +} + +const char *btrfs_super_csum_name(u16 csum_type) +{ + /* csum type is validated at mount time. */ + return btrfs_csums[csum_type].name; +} + +/* + * Return driver name if defined, otherwise the name that's also a valid driver + * name. + */ +const char *btrfs_super_csum_driver(u16 csum_type) +{ + /* csum type is validated at mount time */ + return btrfs_csums[csum_type].driver[0] ? + btrfs_csums[csum_type].driver : + btrfs_csums[csum_type].name; +} + +size_t __attribute_const__ btrfs_get_num_csums(void) +{ + return ARRAY_SIZE(btrfs_csums); +} + +/* + * Start exclusive operation @type, return true on success. + */ +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type) +{ + bool ret = false; + + spin_lock(&fs_info->super_lock); + if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) { + fs_info->exclusive_operation = type; + ret = true; + } + spin_unlock(&fs_info->super_lock); + + return ret; +} + +/* + * Conditionally allow to enter the exclusive operation in case it's compatible + * with the running one. This must be paired with btrfs_exclop_start_unlock() + * and btrfs_exclop_finish(). 
+ * + * Compatibility: + * - the same type is already running + * - when trying to add a device and balance has been paused + * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller + * must check the condition first that would allow none -> @type + */ +bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type) +{ + spin_lock(&fs_info->super_lock); + if (fs_info->exclusive_operation == type || + (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED && + type == BTRFS_EXCLOP_DEV_ADD)) + return true; + + spin_unlock(&fs_info->super_lock); + return false; +} + +void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info) +{ + spin_unlock(&fs_info->super_lock); +} + +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) +{ + spin_lock(&fs_info->super_lock); + WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); + spin_unlock(&fs_info->super_lock); + sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); +} + +void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation op) +{ + switch (op) { + case BTRFS_EXCLOP_BALANCE_PAUSED: + spin_lock(&fs_info->super_lock); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE || + fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD || + fs_info->exclusive_operation == BTRFS_EXCLOP_NONE || + fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); + fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED; + spin_unlock(&fs_info->super_lock); + break; + case BTRFS_EXCLOP_BALANCE: + spin_lock(&fs_info->super_lock); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); + fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; + spin_unlock(&fs_info->super_lock); + break; + default: + btrfs_warn(fs_info, + "invalid exclop balance operation %d requested", op); + } +} void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name) diff --git 
a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 79a1a3d6f04d..b572d6b9730b 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -14,10 +14,10 @@ #include <linux/lockdep.h> #include <linux/spinlock.h> #include <linux/mutex.h> -#include <linux/rwlock_types.h> #include <linux/rwsem.h> #include <linux/semaphore.h> #include <linux/list.h> +#include <linux/pagemap.h> #include <linux/radix-tree.h> #include <linux/workqueue.h> #include <linux/wait.h> @@ -627,6 +627,9 @@ struct btrfs_fs_info { struct kobject *qgroups_kobj; struct kobject *discard_kobj; + /* Track the number of blocks (sectors) read by the filesystem. */ + struct percpu_counter stats_read_blocks; + /* Used to keep from writing metadata until there is a nice batch */ struct percpu_counter dirty_metadata_bytes; struct percpu_counter delalloc_bytes; @@ -887,6 +890,11 @@ struct btrfs_fs_info { #define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \ struct inode *: (_inode)))->root->fs_info) +static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) +{ + return mapping_gfp_constraint(mapping, ~__GFP_FS); +} + static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info) { return READ_ONCE(fs_info->generation); @@ -953,6 +961,8 @@ static inline u64 btrfs_calc_metadata_size(const struct btrfs_fs_info *fs_info, #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ sizeof(struct btrfs_item)) +#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) ((bytes) >> (fs_info)->sectorsize_bits) + static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) { return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && fs_info->zone_size > 0; @@ -982,6 +992,17 @@ void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args); +u16 btrfs_csum_type_size(u16 type); +int btrfs_super_csum_size(const struct btrfs_super_block *s); +const char *btrfs_super_csum_name(u16 csum_type); +const char *btrfs_super_csum_driver(u16 
csum_type); +size_t __attribute_const__ btrfs_get_num_csums(void); + +static inline bool btrfs_is_empty_uuid(const u8 *uuid) +{ + return uuid_is_null((const uuid_t *)uuid); +} + /* Compatibility and incompatibility defines */ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name); @@ -1058,6 +1079,14 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ &(fs_info)->fs_state))) +/* + * We use folio flag owner_2 to indicate there is an ordered extent with + * unfinished IO. + */ +#define folio_test_ordered(folio) folio_test_owner_2(folio) +#define folio_set_ordered(folio) folio_set_owner_2(folio) +#define folio_clear_ordered(folio) folio_clear_owner_2(folio) + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS #define EXPORT_FOR_TESTS diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 29572dfaf878..448aa1a682d6 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -298,8 +298,6 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, ptr = (unsigned long)&extref->name; write_extent_buffer(path->nodes[0], name->name, ptr, name->len); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); - out: btrfs_free_path(path); return ret; @@ -363,8 +361,6 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, ptr = (unsigned long)(ref + 1); } write_extent_buffer(path->nodes[0], name->name, ptr, name->len); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); - out: btrfs_free_path(path); @@ -590,7 +586,6 @@ search_again: num_dec = (orig_num_bytes - extent_num_bytes); if (extent_start != 0) control->sub_bytes += num_dec; - btrfs_mark_buffer_dirty(trans, leaf); } else { extent_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 488edca8333a..a9e56c994e9e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -393,34 +393,13 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, 
unsigned int ilock_flags) * extent (btrfs_finish_ordered_io()). */ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, - struct folio *locked_folio, u64 offset, u64 bytes) { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; - u64 page_start = 0, page_end = 0; struct folio *folio; - if (locked_folio) { - page_start = folio_pos(locked_folio); - page_end = page_start + folio_size(locked_folio) - 1; - } - while (index <= end_index) { - /* - * For locked page, we will call btrfs_mark_ordered_io_finished - * through btrfs_mark_ordered_io_finished() on it - * in run_delalloc_range() for the error handling, which will - * clear page Ordered and run the ordered extent accounting. - * - * Here we can't just clear the Ordered bit, or - * btrfs_mark_ordered_io_finished() would skip the accounting - * for the page range, and the ordered extent will never finish. - */ - if (locked_folio && index == (page_start >> PAGE_SHIFT)) { - index++; - continue; - } folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); index++; if (IS_ERR(folio)) @@ -436,23 +415,6 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, folio_put(folio); } - if (locked_folio) { - /* The locked page covers the full range, nothing needs to be done */ - if (bytes + offset <= page_start + folio_size(locked_folio)) - return; - /* - * In case this page belongs to the delalloc range being - * instantiated then skip it, since the first page of a range is - * going to be properly cleaned up by the caller of - * run_delalloc_range - */ - if (page_start >= offset && page_end <= (offset + bytes - 1)) { - bytes = offset + bytes - folio_pos(locked_folio) - - folio_size(locked_folio); - offset = folio_pos(locked_folio) + folio_size(locked_folio); - } - } - return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); } @@ -564,7 +526,6 @@ static int insert_inline_extent(struct btrfs_trans_handle 
*trans, kunmap_local(kaddr); folio_put(folio); } - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* @@ -1129,19 +1090,14 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, &wbc, false); wbc_detach_inode(&wbc); if (ret < 0) { - btrfs_cleanup_ordered_extents(inode, locked_folio, - start, end - start + 1); - if (locked_folio) { - const u64 page_start = folio_pos(locked_folio); - - folio_start_writeback(locked_folio); - folio_end_writeback(locked_folio); - btrfs_mark_ordered_io_finished(inode, locked_folio, - page_start, PAGE_SIZE, - !ret); - mapping_set_error(locked_folio->mapping, ret); - folio_unlock(locked_folio); - } + btrfs_cleanup_ordered_extents(inode, start, end - start + 1); + if (locked_folio) + btrfs_folio_end_lock(inode->root->fs_info, locked_folio, + start, async_extent->ram_size); + btrfs_err_rl(inode->root->fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), start, async_extent->ram_size, ret); } } @@ -1373,6 +1329,17 @@ static noinline int cow_file_range(struct btrfs_inode *inode, alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes); /* + * We're not doing compressed IO, don't unlock the first page (which + * the caller expects to stay locked), don't clear any dirty bits and + * don't set any writeback bits. + * + * Do set the Ordered (Private2) bit so we know this page was properly + * setup for writepage. + */ + page_ops = (keep_locked ? 0 : PAGE_UNLOCK); + page_ops |= PAGE_SET_ORDERED; + + /* * Relocation relies on the relocated extents to have exactly the same * size as the original extents. 
Normally writeback for relocation data * extents follows a NOCOW path because relocation preallocates the @@ -1415,8 +1382,13 @@ static noinline int cow_file_range(struct btrfs_inode *inode, continue; } if (done_offset) { - *done_offset = start - 1; - return 0; + /* + * Move @end to the end of the processed range, + * and exit the loop to unlock the processed extents. + */ + end = start - 1; + ret = 0; + break; } ret = -ENOSPC; } @@ -1431,6 +1403,10 @@ static noinline int cow_file_range(struct btrfs_inode *inode, file_extent.offset = 0; file_extent.compression = BTRFS_COMPRESS_NONE; + /* + * Locked range will be released either during error clean up or + * after the whole range is finished. + */ lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, &cached); @@ -1476,21 +1452,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, btrfs_dec_block_group_reservations(fs_info, ins.objectid); - /* - * We're not doing compressed IO, don't unlock the first page - * (which the caller expects to stay locked), don't clear any - * dirty bits and don't set any writeback bits - * - * Do set the Ordered flag so we know this page was - * properly setup for writepage. - */ - page_ops = (keep_locked ? 0 : PAGE_UNLOCK); - page_ops |= PAGE_SET_ORDERED; - - extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, - locked_folio, &cached, - EXTENT_LOCKED | EXTENT_DELALLOC, - page_ops); if (num_bytes < cur_alloc_size) num_bytes = 0; else @@ -1507,6 +1468,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (ret) goto out_unlock; } + extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached, + EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); done: if (done_offset) *done_offset = end; @@ -1527,35 +1490,30 @@ out_unlock: * We process each region below. 
*/ - clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; - page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; - /* * For the range (1). We have already instantiated the ordered extents * for this region. They are cleaned up by * btrfs_cleanup_ordered_extents() in e.g, - * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are - * already cleared in the above loop. And, EXTENT_DELALLOC_NEW | - * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup - * function. + * btrfs_run_delalloc_range(). + * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV + * are also handled by the cleanup function. * - * However, in case of @keep_locked, we still need to unlock the pages - * (except @locked_folio) to ensure all the pages are unlocked. + * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and + * finish the writeback of the involved folios, which will be never submitted. */ - if (keep_locked && orig_start < start) { + if (orig_start < start) { + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; + if (!locked_folio) mapping_set_error(inode->vfs_inode.i_mapping, ret); extent_clear_unlock_delalloc(inode, orig_start, start - 1, - locked_folio, NULL, 0, page_ops); + locked_folio, NULL, clear_bits, page_ops); } - /* - * At this point we're unlocked, we want to make sure we're only - * clearing these flags under the extent lock, so lock the rest of the - * range and clear everything up. - */ - lock_extent(&inode->io_tree, start, end, NULL); + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; /* * For the range (2). 
If we reserved an extent for our delalloc range @@ -1589,6 +1547,10 @@ out_unlock: btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size, end - start - cur_alloc_size + 1, NULL); } + btrfs_err_rl(fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), orig_start, end + 1 - orig_start, ret); return ret; } @@ -1809,7 +1771,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, bytes = range_bytes; spin_lock(&sinfo->lock); - btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes); + btrfs_space_info_update_bytes_may_use(sinfo, bytes); spin_unlock(&sinfo->lock); if (count > 0) @@ -1837,7 +1799,6 @@ struct can_nocow_file_extent_args { /* End file offset (inclusive) of the range we want to NOCOW. */ u64 end; bool writeback_path; - bool strict; /* * Free the path passed to can_nocow_file_extent() once it's not needed * anymore. @@ -1892,8 +1853,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, * for its subvolume was created, then this implies the extent is shared, * hence we must COW. */ - if (!args->strict && - btrfs_file_extent_generation(leaf, fi) <= + if (btrfs_file_extent_generation(leaf, fi) <= btrfs_root_last_snapshot(&root->root_item)) goto out; @@ -1922,9 +1882,8 @@ static int can_nocow_file_extent(struct btrfs_path *path, */ btrfs_release_path(path); - ret = btrfs_cross_ref_exist(root, btrfs_ino(inode), - key->offset - args->file_extent.offset, - args->file_extent.disk_bytenr, args->strict, path); + ret = btrfs_cross_ref_exist(inode, key->offset - args->file_extent.offset, + args->file_extent.disk_bytenr, path); WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out; @@ -1971,6 +1930,53 @@ static int can_nocow_file_extent(struct btrfs_path *path, } /* + * Cleanup the dirty folios which will never be submitted due to error. + * + * When running a delalloc range, we may need to split the ranges (due to + * fragmentation or NOCOW). 
If we hit an error in the later part, we will error + * out and previously successfully executed range will never be submitted, thus + * we have to cleanup those folios by clearing their dirty flag, starting and + * finishing the writeback. + */ +static void cleanup_dirty_folios(struct btrfs_inode *inode, + struct folio *locked_folio, + u64 start, u64 end, int error) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct address_space *mapping = inode->vfs_inode.i_mapping; + pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t end_index = end >> PAGE_SHIFT; + u32 len; + + ASSERT(end + 1 - start < U32_MAX); + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(end + 1, fs_info->sectorsize)); + len = end + 1 - start; + + /* + * Handle the locked folio first. + * The btrfs_folio_clamp_*() helpers can handle range out of the folio case. + */ + btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len); + + for (pgoff_t index = start_index; index <= end_index; index++) { + struct folio *folio; + + /* Already handled at the beginning. */ + if (index == locked_folio->index) + continue; + folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS); + /* Cache already dropped, no need to do any cleanup. */ + if (IS_ERR(folio)) + continue; + btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len); + folio_unlock(folio); + folio_put(folio); + } + mapping_set_error(mapping, error); +} + +/* * when nowcow writeback call back. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. * @@ -1985,6 +1991,11 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, struct btrfs_root *root = inode->root; struct btrfs_path *path; u64 cow_start = (u64)-1; + /* + * If not 0, represents the inclusive end of the last fallback_to_cow() + * range. Only for error handling. 
+ */ + u64 cow_end = 0; u64 cur_offset = start; int ret; bool check_prev = true; @@ -2145,6 +2156,7 @@ must_cow: found_key.offset - 1); cow_start = (u64)-1; if (ret) { + cow_end = found_key.offset - 1; btrfs_dec_nocow_writers(nocow_bg); goto error; } @@ -2218,11 +2230,12 @@ must_cow: cow_start = cur_offset; if (cow_start != (u64)-1) { - cur_offset = end; ret = fallback_to_cow(inode, locked_folio, cow_start, end); cow_start = (u64)-1; - if (ret) + if (ret) { + cow_end = end; goto error; + } } btrfs_free_path(path); @@ -2230,12 +2243,41 @@ must_cow: error: /* + * There are several error cases: + * + * 1) Failed without falling back to COW + * start cur_offset end + * |/////////////| | + * + * For range [start, cur_offset) the folios are already unlocked (except + * @locked_folio), EXTENT_DELALLOC already removed. + * Only need to clear the dirty flag as they will never be submitted. + * Ordered extent and extent maps are handled by + * btrfs_mark_ordered_io_finished() inside run_delalloc_range(). + * + * 2) Failed with error from fallback_to_cow() + * start cur_offset cow_end end + * |/////////////|-----------| | + * + * For range [start, cur_offset) it's the same as case 1). + * But for range [cur_offset, cow_end), the folios have dirty flag + * cleared and unlocked, EXTENT_DEALLLOC cleared by cow_file_range(). + * + * Thus we should not call extent_clear_unlock_delalloc() on range + * [cur_offset, cow_end), as the folios are already unlocked. + * + * So clear the folio dirty flags for [start, cur_offset) first. + */ + if (cur_offset > start) + cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret); + + /* * If an error happened while a COW region is outstanding, cur_offset - * needs to be reset to cow_start to ensure the COW region is unlocked - * as well. + * needs to be reset to @cow_end + 1 to skip the COW range, as + * cow_file_range() will do the proper cleanup at error. 
*/ - if (cow_start != (u64)-1) - cur_offset = cow_start; + if (cow_end) + cur_offset = cow_end + 1; /* * We need to lock the extent here because we're clearing DELALLOC and @@ -2255,6 +2297,10 @@ error: btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL); } btrfs_free_path(path); + btrfs_err_rl(fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), start, end + 1 - start, ret); return ret; } @@ -2305,8 +2351,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol out: if (ret < 0) - btrfs_cleanup_ordered_extents(inode, locked_folio, start, - end - start + 1); + btrfs_cleanup_ordered_extents(inode, start, end - start + 1); return ret; } @@ -2921,7 +2966,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(struct btrfs_file_extent_item)); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* @@ -4085,7 +4129,6 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item); fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_set_inode_last_trans(trans, inode); ret = 0; failed: @@ -6380,7 +6423,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, } } - btrfs_mark_buffer_dirty(trans, path->nodes[0]); /* * We don't need the path anymore, plus inheriting properties, adding * ACLs, security xattrs, orphan item or adding the link, will result in @@ -6702,18 +6744,18 @@ fail: return err; } -static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; inode = new_inode(dir->i_sb); if (!inode) - return -ENOMEM; + return ERR_PTR(-ENOMEM); inode_init_owner(idmap, inode, 
dir, S_IFDIR | mode); inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; - return btrfs_create_common(dir, dentry, inode); + return ERR_PTR(btrfs_create_common(dir, dentry, inode)); } static noinline int uncompress_inline(struct btrfs_path *path, @@ -7011,8 +7053,6 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) * @orig_start: (optional) Return the original file offset of the file extent * @orig_len: (optional) Return the original on-disk length of the file extent * @ram_bytes: (optional) Return the ram_bytes of the file extent - * @strict: if true, omit optimizations that might force us into unnecessary - * cow. e.g., don't trust generation number. * * Return: * >0 and update @len if we can do nocow write @@ -7024,7 +7064,7 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) */ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, - bool nowait, bool strict) + bool nowait) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct can_nocow_file_extent_args nocow_args = { 0 }; @@ -7077,7 +7117,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, nocow_args.start = offset; nocow_args.end = offset + *len - 1; - nocow_args.strict = strict; nocow_args.free_path = true; ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args); @@ -8027,31 +8066,45 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), old_name, &old_rename_ctx); - if (!ret) - ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto 
out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } /* dest is a subvolume */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), new_name, &new_rename_ctx); - if (!ret) - ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), @@ -8287,16 +8340,23 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } else { ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), &old_fname.disk_name, &rename_ctx); - if (!ret) - ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } if (new_inode) { @@ -8304,18 +8364,27 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto 
out_fail; + } BUG_ON(new_inode->i_nlink == 0); } else { ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), &new_fname.disk_name); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } - if (!ret && new_inode->i_nlink == 0) + if (new_inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, BTRFS_I(d_inode(new_dentry))); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } } @@ -8655,7 +8724,6 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, ptr = btrfs_file_extent_inline_start(ei); write_extent_buffer(leaf, symname, ptr, name_len); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); d_instantiate_new(dentry, inode); @@ -9078,9 +9146,9 @@ out: } struct btrfs_encoded_read_private { - wait_queue_head_t wait; + struct completion done; void *uring_ctx; - atomic_t pending; + refcount_t pending_refs; blk_status_t status; }; @@ -9099,14 +9167,14 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) */ WRITE_ONCE(priv->status, bbio->bio.bi_status); } - if (atomic_dec_and_test(&priv->pending)) { + if (refcount_dec_and_test(&priv->pending_refs)) { int err = blk_status_to_errno(READ_ONCE(priv->status)); if (priv->uring_ctx) { btrfs_uring_read_extent_endio(priv->uring_ctx, err); kfree(priv); } else { - wake_up(&priv->wait); + complete(&priv->done); } } bio_put(&bbio->bio); @@ -9126,8 +9194,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, if (!priv) return -ENOMEM; - init_waitqueue_head(&priv->wait); - atomic_set(&priv->pending, 1); + init_completion(&priv->done); + refcount_set(&priv->pending_refs, 1); priv->status = 0; priv->uring_ctx = uring_ctx; @@ -9140,7 +9208,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { - atomic_inc(&priv->pending); 
+ refcount_inc(&priv->pending_refs); btrfs_submit_bbio(bbio, 0); bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, @@ -9155,11 +9223,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, disk_io_size -= bytes; } while (disk_io_size); - atomic_inc(&priv->pending); + refcount_inc(&priv->pending_refs); btrfs_submit_bbio(bbio, 0); if (uring_ctx) { - if (atomic_dec_return(&priv->pending) == 0) { + if (refcount_dec_and_test(&priv->pending_refs)) { ret = blk_status_to_errno(READ_ONCE(priv->status)); btrfs_uring_read_extent_endio(uring_ctx, ret); kfree(priv); @@ -9168,8 +9236,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, return -EIOCBQUEUED; } else { - if (atomic_dec_return(&priv->pending) != 0) - io_wait_event(priv->wait, !atomic_read(&priv->pending)); + if (!refcount_dec_and_test(&priv->pending_refs)) + wait_for_completion_io(&priv->done); /* See btrfs_encoded_read_endio() for ordering. */ ret = blk_status_to_errno(READ_ONCE(priv->status)); kfree(priv); @@ -9799,15 +9867,25 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; - struct extent_map *em = NULL; struct btrfs_chunk_map *map = NULL; struct btrfs_device *device = NULL; struct btrfs_swap_info bsi = { .lowest_ppage = (sector_t)-1ULL, }; + struct btrfs_backref_share_check_ctx *backref_ctx = NULL; + struct btrfs_path *path = NULL; int ret = 0; u64 isize; - u64 start; + u64 prev_extent_end = 0; + + /* + * Acquire the inode's mmap lock to prevent races with memory mapped + * writes, as they could happen after we flush delalloc below and before + * we lock the extent range further below. The inode was already locked + * up in the call chain. 
+ */ + btrfs_assert_inode_locked(BTRFS_I(inode)); + down_write(&BTRFS_I(inode)->i_mmap_lock); /* * If the swap file was just created, make sure delalloc is done. If the @@ -9816,22 +9894,32 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, */ ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (ret) - return ret; + goto out_unlock_mmap; /* * The inode is locked, so these flags won't change after we check them. */ if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { btrfs_warn(fs_info, "swapfile must not be compressed"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { btrfs_warn(fs_info, "swapfile must not be checksummed"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; + } + + path = btrfs_alloc_path(); + backref_ctx = btrfs_alloc_backref_share_check_ctx(); + if (!path || !backref_ctx) { + ret = -ENOMEM; + goto out_unlock_mmap; } /* @@ -9846,7 +9934,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { btrfs_warn(fs_info, "cannot activate swapfile while exclusive operation is running"); - return -EBUSY; + ret = -EBUSY; + goto out_unlock_mmap; } /* @@ -9860,7 +9949,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because snapshot creation is in progress"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } /* * Snapshots can create extents which require COW even if NODATACOW is @@ -9881,7 +9971,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, btrfs_warn(fs_info, "cannot activate swapfile because subvolume %llu is being 
deleted", btrfs_root_id(root)); - return -EPERM; + ret = -EPERM; + goto out_unlock_mmap; } atomic_inc(&root->nr_swapfiles); spin_unlock(&root->root_item_lock); @@ -9889,24 +9980,39 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); lock_extent(io_tree, 0, isize - 1, &cached_state); - start = 0; - while (start < isize) { - u64 logical_block_start, physical_block_start; + while (prev_extent_end < isize) { + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *ei; struct btrfs_block_group *bg; - u64 len = isize - start; + u64 logical_block_start; + u64 physical_block_start; + u64 extent_gen; + u64 disk_bytenr; + u64 len; - em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); + key.objectid = btrfs_ino(BTRFS_I(inode)); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = prev_extent_end; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) goto out; - } - if (em->disk_bytenr == EXTENT_MAP_HOLE) { + /* + * If key not found it means we have an implicit hole (NO_HOLES + * is enabled). 
+ */ + if (ret > 0) { btrfs_warn(fs_info, "swapfile must not have holes"); ret = -EINVAL; goto out; } - if (em->disk_bytenr == EXTENT_MAP_INLINE) { + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) { /* * It's unlikely we'll ever actually find ourselves * here, as a file small enough to fit inline won't be @@ -9918,23 +10024,45 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, ret = -EINVAL; goto out; } - if (extent_map_is_compressed(em)) { + + if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { btrfs_warn(fs_info, "swapfile must not be compressed"); ret = -EINVAL; goto out; } - logical_block_start = extent_map_block_start(em) + (start - em->start); - len = min(len, em->len - (start - em->start)); - free_extent_map(em); - em = NULL; + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + if (disk_bytenr == 0) { + btrfs_warn(fs_info, "swapfile must not have holes"); + ret = -EINVAL; + goto out; + } + + logical_block_start = disk_bytenr + btrfs_file_extent_offset(leaf, ei); + extent_gen = btrfs_file_extent_generation(leaf, ei); + prev_extent_end = btrfs_file_extent_end(path); + + if (prev_extent_end > isize) + len = isize - key.offset; + else + len = btrfs_file_extent_num_bytes(leaf, ei); + + backref_ctx->curr_leaf_bytenr = leaf->start; - ret = can_nocow_extent(inode, start, &len, NULL, false, true); + /* + * Don't need the path anymore, release to avoid deadlocks when + * calling btrfs_is_data_extent_shared() because when joining a + * transaction it can block waiting for the current one's commit + * which in turn may be trying to lock the same leaf to flush + * delayed items for example. 
+ */ + btrfs_release_path(path); + + ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr, + extent_gen, backref_ctx); if (ret < 0) { goto out; - } else if (ret) { - ret = 0; - } else { + } else if (ret > 0) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); ret = -EINVAL; @@ -9969,7 +10097,6 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, physical_block_start = (map->stripes[0].physical + (logical_block_start - map->start)); - len = min(len, map->chunk_len - (logical_block_start - map->start)); btrfs_free_chunk_map(map); map = NULL; @@ -10010,20 +10137,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (ret) goto out; } - bsi.start = start; + bsi.start = key.offset; bsi.block_start = physical_block_start; bsi.block_len = len; } - start += len; + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } + + cond_resched(); } if (bsi.block_len) ret = btrfs_add_swap_extent(sis, &bsi); out: - if (!IS_ERR_OR_NULL(em)) - free_extent_map(em); if (!IS_ERR_OR_NULL(map)) btrfs_free_chunk_map(map); @@ -10036,6 +10166,10 @@ out: btrfs_exclop_finish(fs_info); +out_unlock_mmap: + up_write(&BTRFS_I(inode)->i_mmap_lock); + btrfs_free_backref_share_ctx(backref_ctx); + btrfs_free_path(path); if (ret) return ret; @@ -10044,7 +10178,6 @@ out: *span = bsi.highest_ppage - bsi.lowest_ppage + 1; sis->max = bsi.nr_pages; sis->pages = bsi.nr_pages - 1; - sis->highest_bit = bsi.nr_pages - 1; return bsi.nr_extents; } #else diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 3af8bb0c8d75..6c18bad53cd3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -403,86 +403,6 @@ update_flags: return ret; } -/* - * Start exclusive operation @type, return true on success - */ -bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation type) -{ - bool ret = false; - - spin_lock(&fs_info->super_lock); - if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) { - 
fs_info->exclusive_operation = type; - ret = true; - } - spin_unlock(&fs_info->super_lock); - - return ret; -} - -/* - * Conditionally allow to enter the exclusive operation in case it's compatible - * with the running one. This must be paired with btrfs_exclop_start_unlock and - * btrfs_exclop_finish. - * - * Compatibility: - * - the same type is already running - * - when trying to add a device and balance has been paused - * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller - * must check the condition first that would allow none -> @type - */ -bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation type) -{ - spin_lock(&fs_info->super_lock); - if (fs_info->exclusive_operation == type || - (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED && - type == BTRFS_EXCLOP_DEV_ADD)) - return true; - - spin_unlock(&fs_info->super_lock); - return false; -} - -void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info) -{ - spin_unlock(&fs_info->super_lock); -} - -void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) -{ - spin_lock(&fs_info->super_lock); - WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); - spin_unlock(&fs_info->super_lock); - sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); -} - -void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation op) -{ - switch (op) { - case BTRFS_EXCLOP_BALANCE_PAUSED: - spin_lock(&fs_info->super_lock); - ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE || - fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD || - fs_info->exclusive_operation == BTRFS_EXCLOP_NONE || - fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); - fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED; - spin_unlock(&fs_info->super_lock); - break; - case BTRFS_EXCLOP_BALANCE: - spin_lock(&fs_info->super_lock); - ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); - 
fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; - spin_unlock(&fs_info->super_lock); - break; - default: - btrfs_warn(fs_info, - "invalid exclop balance operation %d requested", op); - } -} - static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg) { return put_user(inode->i_generation, arg); @@ -551,17 +471,6 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, return ret; } -int __pure btrfs_is_empty_uuid(const u8 *uuid) -{ - int i; - - for (i = 0; i < BTRFS_UUID_SIZE; i++) { - if (uuid[i]) - return 0; - } - return 1; -} - /* * Calculate the number of transaction items to reserve for creating a subvolume * or snapshot, not including the inode, directory entries, or parent directory. @@ -2635,6 +2544,15 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) goto out; } + /* + * Don't allow defrag on pre-content watched files, as it could + * populate the page cache with 0's via readahead. + */ + if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) { + ret = -EINVAL; + goto out; + } + if (argp) { if (copy_from_user(&range, argp, sizeof(range))) { ret = -EFAULT; @@ -3007,7 +2925,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); btrfs_set_dir_item_key(path->nodes[0], di, &disk_key); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL); @@ -4878,25 +4795,29 @@ out_fail: return ret; } +struct btrfs_uring_encoded_data { + struct btrfs_ioctl_encoded_io_args args; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov; + struct iov_iter iter; +}; + static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags) { size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); size_t copy_end; - struct btrfs_ioctl_encoded_io_args args = { 0 }; int ret; u64 disk_bytenr, disk_io_size; struct file *file; struct btrfs_inode 
*inode; struct btrfs_fs_info *fs_info; struct extent_io_tree *io_tree; - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; - struct iov_iter iter; loff_t pos; struct kiocb kiocb; struct extent_state *cached_state = NULL; u64 start, lockend; void __user *sqe_addr; + struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data; if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; @@ -4910,43 +4831,64 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue if (issue_flags & IO_URING_F_COMPAT) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) - struct btrfs_ioctl_encoded_io_args_32 args32; - copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags); - if (copy_from_user(&args32, sqe_addr, copy_end)) { - ret = -EFAULT; - goto out_acct; - } - args.iov = compat_ptr(args32.iov); - args.iovcnt = args32.iovcnt; - args.offset = args32.offset; - args.flags = args32.flags; #else return -ENOTTY; #endif } else { copy_end = copy_end_kernel; - if (copy_from_user(&args, sqe_addr, copy_end)) { - ret = -EFAULT; + } + + if (!data) { + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) { + ret = -ENOMEM; goto out_acct; } - } - if (args.flags != 0) - return -EINVAL; + io_uring_cmd_get_async_data(cmd)->op_data = data; - ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), - &iov, &iter); - if (ret < 0) - goto out_acct; + if (issue_flags & IO_URING_F_COMPAT) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; - if (iov_iter_count(&iter) == 0) { - ret = 0; - goto out_free; + if (copy_from_user(&args32, sqe_addr, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + + data->args.iov = compat_ptr(args32.iov); + data->args.iovcnt = args32.iovcnt; + data->args.offset = args32.offset; + data->args.flags = args32.flags; +#endif + } else { + if (copy_from_user(&data->args, sqe_addr, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + } + + if 
(data->args.flags != 0) { + ret = -EINVAL; + goto out_acct; + } + + data->iov = data->iovstack; + ret = import_iovec(ITER_DEST, data->args.iov, data->args.iovcnt, + ARRAY_SIZE(data->iovstack), &data->iov, + &data->iter); + if (ret < 0) + goto out_acct; + + if (iov_iter_count(&data->iter) == 0) { + ret = 0; + goto out_free; + } } - pos = args.offset; - ret = rw_verify_area(READ, file, &pos, args.len); + pos = data->args.offset; + ret = rw_verify_area(READ, file, &pos, data->args.len); if (ret < 0) goto out_free; @@ -4959,15 +4901,16 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue start = ALIGN_DOWN(pos, fs_info->sectorsize); lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; - ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state, + ret = btrfs_encoded_read(&kiocb, &data->iter, &data->args, &cached_state, &disk_bytenr, &disk_io_size); if (ret < 0 && ret != -EIOCBQUEUED) goto out_free; file_accessed(file); - if (copy_to_user(sqe_addr + copy_end, (const char *)&args + copy_end_kernel, - sizeof(args) - copy_end_kernel)) { + if (copy_to_user(sqe_addr + copy_end, + (const char *)&data->args + copy_end_kernel, + sizeof(data->args) - copy_end_kernel)) { if (ret == -EIOCBQUEUED) { unlock_extent(io_tree, start, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); @@ -4977,40 +4920,22 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue } if (ret == -EIOCBQUEUED) { - u64 count; - - /* - * If we've optimized things by storing the iovecs on the stack, - * undo this. 
- */ - if (!iov) { - iov = kmalloc(sizeof(struct iovec) * args.iovcnt, GFP_NOFS); - if (!iov) { - unlock_extent(io_tree, start, lockend, &cached_state); - btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); - ret = -ENOMEM; - goto out_acct; - } - - memcpy(iov, iovstack, sizeof(struct iovec) * args.iovcnt); - } - - count = min_t(u64, iov_iter_count(&iter), disk_io_size); + u64 count = min_t(u64, iov_iter_count(&data->iter), disk_io_size); /* Match ioctl by not returning past EOF if uncompressed. */ - if (!args.compression) - count = min_t(u64, count, args.len); + if (!data->args.compression) + count = min_t(u64, count, data->args.len); - ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend, - cached_state, disk_bytenr, - disk_io_size, count, - args.compression, iov, cmd); + ret = btrfs_uring_read_extent(&kiocb, &data->iter, start, lockend, + cached_state, disk_bytenr, disk_io_size, + count, data->args.compression, + data->iov, cmd); goto out_acct; } out_free: - kfree(iov); + kfree(data->iov); out_acct: if (ret > 0) @@ -5020,6 +4945,128 @@ out_acct: return ret; } +static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + loff_t pos; + struct kiocb kiocb; + struct file *file; + ssize_t ret; + void __user *sqe_addr; + struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data; + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_acct; + } + + file = cmd->file; + sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); + + if (!(file->f_mode & FMODE_WRITE)) { + ret = -EBADF; + goto out_acct; + } + + if (!data) { + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) { + ret = -ENOMEM; + goto out_acct; + } + + io_uring_cmd_get_async_data(cmd)->op_data = data; + + if (issue_flags & IO_URING_F_COMPAT) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; + + if (copy_from_user(&args32, sqe_addr, sizeof(args32))) { + ret = -EFAULT; + goto out_acct; + } + 
data->args.iov = compat_ptr(args32.iov); + data->args.iovcnt = args32.iovcnt; + data->args.offset = args32.offset; + data->args.flags = args32.flags; + data->args.len = args32.len; + data->args.unencoded_len = args32.unencoded_len; + data->args.unencoded_offset = args32.unencoded_offset; + data->args.compression = args32.compression; + data->args.encryption = args32.encryption; + memcpy(data->args.reserved, args32.reserved, + sizeof(data->args.reserved)); +#else + ret = -ENOTTY; + goto out_acct; +#endif + } else { + if (copy_from_user(&data->args, sqe_addr, sizeof(data->args))) { + ret = -EFAULT; + goto out_acct; + } + } + + ret = -EINVAL; + if (data->args.flags != 0) + goto out_acct; + if (memchr_inv(data->args.reserved, 0, sizeof(data->args.reserved))) + goto out_acct; + if (data->args.compression == BTRFS_ENCODED_IO_COMPRESSION_NONE && + data->args.encryption == BTRFS_ENCODED_IO_ENCRYPTION_NONE) + goto out_acct; + if (data->args.compression >= BTRFS_ENCODED_IO_COMPRESSION_TYPES || + data->args.encryption >= BTRFS_ENCODED_IO_ENCRYPTION_TYPES) + goto out_acct; + if (data->args.unencoded_offset > data->args.unencoded_len) + goto out_acct; + if (data->args.len > data->args.unencoded_len - data->args.unencoded_offset) + goto out_acct; + + data->iov = data->iovstack; + ret = import_iovec(ITER_SOURCE, data->args.iov, data->args.iovcnt, + ARRAY_SIZE(data->iovstack), &data->iov, + &data->iter); + if (ret < 0) + goto out_acct; + + if (iov_iter_count(&data->iter) == 0) { + ret = 0; + goto out_iov; + } + } + + if (issue_flags & IO_URING_F_NONBLOCK) { + ret = -EAGAIN; + goto out_acct; + } + + pos = data->args.offset; + ret = rw_verify_area(WRITE, file, &pos, data->args.len); + if (ret < 0) + goto out_iov; + + init_sync_kiocb(&kiocb, file); + ret = kiocb_set_rw_flags(&kiocb, 0, WRITE); + if (ret) + goto out_iov; + kiocb.ki_pos = pos; + + file_start_write(file); + + ret = btrfs_do_write_iter(&kiocb, &data->iter, &data->args); + if (ret > 0) + fsnotify_modify(file); + + 
file_end_write(file); +out_iov: + kfree(data->iov); +out_acct: + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); + return ret; +} + int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { switch (cmd->cmd_op) { @@ -5028,6 +5075,12 @@ int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) case BTRFS_IOC_ENCODED_READ_32: #endif return btrfs_uring_encoded_read(cmd, issue_flags); + + case BTRFS_IOC_ENCODED_WRITE: +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + case BTRFS_IOC_ENCODED_WRITE_32: +#endif + return btrfs_uring_encoded_write(cmd, issue_flags); } return -EINVAL; @@ -5300,6 +5353,8 @@ long btrfs_ioctl(struct file *file, unsigned int return fsverity_ioctl_enable(file, (const void __user *)argp); case FS_IOC_MEASURE_VERITY: return fsverity_ioctl_measure(file, argp); + case FS_IOC_READ_VERITY_METADATA: + return fsverity_ioctl_read_metadata(file, argp); case BTRFS_IOC_ENCODED_READ: return btrfs_ioctl_encoded_read(file, argp, false); case BTRFS_IOC_ENCODED_WRITE: diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 2b760c8778f8..ce915fcda43b 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -19,7 +19,6 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); -int __pure btrfs_is_empty_uuid(const u8 *uuid); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 35036b151bf5..c69e57ff804b 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -199,8 +199,13 @@ static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { lockdep_assert_held_write(&eb->lock); } +static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb) +{ + 
lockdep_assert_held_read(&eb->lock); +} #else static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { } +static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { } #endif void btrfs_unlock_up_safe(struct btrfs_path *path, int level); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 30eceaf829a7..4aca7475fd82 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -1229,6 +1229,18 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( */ if (WARN_ON_ONCE(len >= ordered->num_bytes)) return ERR_PTR(-EINVAL); + /* + * If our ordered extent had an error there's no point in continuing. + * The error may have come from a transaction abort done either by this + * task or some other concurrent task, and the transaction abort path + * iterates over all existing ordered extents and sets the flag + * BTRFS_ORDERED_IOERR on them. + */ + if (unlikely(flags & (1U << BTRFS_ORDERED_IOERR))) { + const int fs_error = BTRFS_FS_ERROR(fs_info); + + return fs_error ? ERR_PTR(fs_error) : ERR_PTR(-EIO); + } /* We cannot split partially completed ordered extents. 
*/ if (ordered->bytes_left) { ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS)); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index a6f92836c9b1..f9d3766c809b 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -673,9 +673,6 @@ static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, key.offset = dst; ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); - - btrfs_mark_buffer_dirty(trans, path->nodes[0]); - btrfs_free_path(path); return ret; } @@ -752,8 +749,6 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0); btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0); - btrfs_mark_buffer_dirty(trans, leaf); - btrfs_release_path(path); key.type = BTRFS_QGROUP_LIMIT_KEY; @@ -771,8 +766,6 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0); btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0); - btrfs_mark_buffer_dirty(trans, leaf); - ret = 0; out: btrfs_free_path(path); @@ -859,9 +852,6 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl); btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer); btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl); - - btrfs_mark_buffer_dirty(trans, l); - out: btrfs_free_path(path); return ret; @@ -905,9 +895,6 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr); btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl); btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr); - - btrfs_mark_buffer_dirty(trans, l); - out: btrfs_free_path(path); return ret; @@ -947,9 +934,6 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans) btrfs_set_qgroup_status_generation(l, ptr, trans->transid); btrfs_set_qgroup_status_rescan(l, ptr, 
fs_info->qgroup_rescan_progress.objectid); - - btrfs_mark_buffer_dirty(trans, l); - out: btrfs_free_path(path); return ret; @@ -1121,6 +1105,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON; if (simple) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; + btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid); } else { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; @@ -1129,8 +1114,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, BTRFS_QGROUP_STATUS_FLAGS_MASK); btrfs_set_qgroup_status_rescan(leaf, ptr, 0); - btrfs_mark_buffer_dirty(trans, leaf); - key.objectid = 0; key.type = BTRFS_ROOT_REF_KEY; key.offset = 0; @@ -1254,8 +1237,6 @@ out_add_root: spin_lock(&fs_info->qgroup_lock); fs_info->quota_root = quota_root; set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - if (simple) - btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); spin_unlock(&fs_info->qgroup_lock); /* Skip rescan for simple qgroups. */ @@ -1839,9 +1820,19 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) * Thus its reserved space should all be zero, no matter if qgroup * is consistent or the mode. 
*/ - WARN_ON(qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] || - qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] || - qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]); + if (qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] || + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] || + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + btrfs_warn_rl(fs_info, +"to be deleted qgroup %u/%llu has non-zero numbers, data %llu meta prealloc %llu meta pertrans %llu", + btrfs_qgroup_level(qgroup->qgroupid), + btrfs_qgroup_subvolid(qgroup->qgroupid), + qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA], + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC], + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]); + + } /* * The same for rfer/excl numbers, but that's only if our qgroup is * consistent and if it's in regular qgroup mode. @@ -1850,8 +1841,9 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) */ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL && !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) { - if (WARN_ON(qgroup->rfer || qgroup->excl || - qgroup->rfer_cmpr || qgroup->excl_cmpr)) { + if (qgroup->rfer || qgroup->excl || + qgroup->rfer_cmpr || qgroup->excl_cmpr) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); btrfs_warn_rl(fs_info, "to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu", btrfs_qgroup_level(qgroup->qgroupid), @@ -1888,11 +1880,7 @@ int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 su * Commit current transaction to make sure all the rfer/excl numbers * get updated. 
*/ - trans = btrfs_start_transaction(fs_info->quota_root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - ret = btrfs_commit_transaction(trans); + ret = btrfs_commit_current_transaction(fs_info->quota_root); if (ret < 0) return ret; @@ -1905,8 +1893,11 @@ int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 su /* * It's squota and the subvolume still has numbers needed for future * accounting, in this case we can not delete it. Just skip it. + * + * Or the qgroup is already removed by a qgroup rescan. For both cases we're + * safe to ignore them. */ - if (ret == -EBUSY) + if (ret == -EBUSY || ret == -ENOENT) ret = 0; return ret; } diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 9ffc79f250fb..1834011ccc49 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -13,12 +13,13 @@ #include "volumes.h" #include "print-tree.h" -static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, +static int btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path, const struct btrfs_key *oldkey, u64 newlen, u64 frontpad) { - struct btrfs_stripe_extent *extent; + struct btrfs_root *stripe_root = trans->fs_info->stripe_root; + struct btrfs_stripe_extent *extent, *newitem; struct extent_buffer *leaf; int slot; size_t item_size; @@ -27,23 +28,39 @@ static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, .type = BTRFS_RAID_STRIPE_KEY, .offset = newlen, }; + int ret; + ASSERT(newlen > 0); ASSERT(oldkey->type == BTRFS_RAID_STRIPE_KEY); leaf = path->nodes[0]; slot = path->slots[0]; item_size = btrfs_item_size(leaf, slot); + + newitem = kzalloc(item_size, GFP_NOFS); + if (!newitem) + return -ENOMEM; + extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent); for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) { struct btrfs_raid_stride *stride = &extent->strides[i]; u64 phys; - phys = 
btrfs_raid_stride_physical(leaf, stride); - btrfs_set_raid_stride_physical(leaf, stride, phys + frontpad); + phys = btrfs_raid_stride_physical(leaf, stride) + frontpad; + btrfs_set_stack_raid_stride_physical(&newitem->strides[i], phys); } - btrfs_set_item_key_safe(trans, path, &newkey); + ret = btrfs_del_item(trans, stripe_root, path); + if (ret) + goto out; + + btrfs_release_path(path); + ret = btrfs_insert_item(trans, stripe_root, &newkey, newitem, item_size); + +out: + kfree(newitem); + return ret; } int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length) @@ -59,9 +76,22 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le int slot; int ret; - if (!stripe_root) + if (!btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE) || !stripe_root) return 0; + if (!btrfs_is_testing(fs_info)) { + struct btrfs_chunk_map *map; + bool use_rst; + + map = btrfs_find_chunk_map(fs_info, start, length); + if (!map) + return -EINVAL; + use_rst = btrfs_need_stripe_tree_update(fs_info, map->type); + btrfs_free_chunk_map(map); + if (!use_rst) + return 0; + } + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -85,6 +115,37 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le found_end = found_start + key.offset; ret = 0; + /* + * The stripe extent starts before the range we want to delete, + * but the range spans more than one stripe extent: + * + * |--- RAID Stripe Extent ---||--- RAID Stripe Extent ---| + * |--- keep ---|--- drop ---| + * + * This means we have to get the previous item, truncate its + * length and then restart the search. 
+ */ + if (found_start > start) { + if (slot == 0) { + ret = btrfs_previous_item(stripe_root, path, start, + BTRFS_RAID_STRIPE_KEY); + if (ret) { + if (ret > 0) + ret = -ENOENT; + break; + } + } else { + path->slots[0]--; + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + found_start = key.objectid; + found_end = found_start + key.offset; + ASSERT(found_start <= start); + } + if (key.type != BTRFS_RAID_STRIPE_KEY) break; @@ -96,6 +157,54 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le found_start, found_end); /* + * The stripe extent starts before the range we want to delete + * and ends after the range we want to delete, i.e. we're + * punching a hole in the stripe extent: + * + * |--- RAID Stripe Extent ---| + * | keep |--- drop ---| keep | + * + * This means we need to a) truncate the existing item and b) + * create a second item for the remaining range. + */ + if (found_start < start && found_end > end) { + size_t item_size; + u64 diff_start = start - found_start; + u64 diff_end = found_end - end; + struct btrfs_stripe_extent *extent; + struct btrfs_key newkey = { + .objectid = end, + .type = BTRFS_RAID_STRIPE_KEY, + .offset = diff_end, + }; + + /* The "right" item. */ + ret = btrfs_duplicate_item(trans, stripe_root, path, &newkey); + if (ret) + break; + + item_size = btrfs_item_size(leaf, path->slots[0]); + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_stripe_extent); + + for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) { + struct btrfs_raid_stride *stride = &extent->strides[i]; + u64 phys; + + phys = btrfs_raid_stride_physical(leaf, stride); + phys += diff_start + length; + btrfs_set_raid_stride_physical(leaf, stride, phys); + } + + /* The "left" item. 
*/ + path->slots[0]--; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_partially_delete_raid_extent(trans, path, &key, + diff_start, 0); + break; + } + + /* * The stripe extent starts before the range we want to delete: * * |--- RAID Stripe Extent ---| @@ -105,11 +214,18 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le * length to the new size and then re-insert the item. */ if (found_start < start) { - u64 diff = start - found_start; + u64 diff_start = start - found_start; btrfs_partially_delete_raid_extent(trans, path, &key, - diff, 0); - break; + diff_start, 0); + + start += (key.offset - diff_start); + length -= (key.offset - diff_start); + if (length == 0) + break; + + btrfs_release_path(path); + continue; } /* @@ -122,13 +238,16 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le * length to the new size and then re-insert the item. */ if (found_end > end) { - u64 diff = found_end - end; + u64 diff_end = found_end - end; btrfs_partially_delete_raid_extent(trans, path, &key, - diff, diff); + key.offset - length, + length); + ASSERT(key.offset - diff_end == length); break; } + /* Finally we can delete the whole item, no more special cases. 
*/ ret = btrfs_del_item(trans, stripe_root, path); if (ret) break; @@ -169,7 +288,6 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, stripe_extent, btrfs_item_ptr_offset(leaf, slot), item_size); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); return ret; @@ -199,12 +317,8 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, for (int i = 0; i < num_stripes; i++) { u64 devid = bioc->stripes[i].dev->devid; u64 physical = bioc->stripes[i].physical; - u64 length = bioc->stripes[i].length; struct btrfs_raid_stride *raid_stride = &stripe_extent->strides[i]; - if (length == 0) - length = bioc->size; - btrfs_set_stack_raid_stride_devid(raid_stride, devid); btrfs_set_stack_raid_stride_physical(raid_stride, physical); } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index bf267bdfa8f8..af0969b70b53 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -342,12 +342,6 @@ static bool handle_useless_nodes(struct reloc_control *rc, if (cur == node) ret = true; - /* The node is the lowest node */ - if (cur->lowest) { - list_del_init(&cur->lower); - cur->lowest = 0; - } - /* Cleanup the lower edges */ while (!list_empty(&cur->lower)) { struct btrfs_backref_edge *edge; @@ -373,7 +367,6 @@ static bool handle_useless_nodes(struct reloc_control *rc, * cache to avoid unnecessary backref lookup. */ if (cur->level > 0) { - list_add(&cur->list, &cache->detached); cur->detached = 1; } else { rb_erase(&cur->rb_node, &cache->rb_root); @@ -426,7 +419,6 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( goto out; } - node->lowest = 1; cur = node; /* Breadth-first search to build backref cache */ @@ -470,92 +462,6 @@ out: } /* - * helper to add backref node for the newly created snapshot. 
- * the backref node is created by cloning backref node that - * corresponds to root of source tree - */ -static int clone_backref_node(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - const struct btrfs_root *src, - struct btrfs_root *dest) -{ - struct btrfs_root *reloc_root = src->reloc_root; - struct btrfs_backref_cache *cache = &rc->backref_cache; - struct btrfs_backref_node *node = NULL; - struct btrfs_backref_node *new_node; - struct btrfs_backref_edge *edge; - struct btrfs_backref_edge *new_edge; - struct rb_node *rb_node; - - rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start); - if (rb_node) { - node = rb_entry(rb_node, struct btrfs_backref_node, rb_node); - if (node->detached) - node = NULL; - else - BUG_ON(node->new_bytenr != reloc_root->node->start); - } - - if (!node) { - rb_node = rb_simple_search(&cache->rb_root, - reloc_root->commit_root->start); - if (rb_node) { - node = rb_entry(rb_node, struct btrfs_backref_node, - rb_node); - BUG_ON(node->detached); - } - } - - if (!node) - return 0; - - new_node = btrfs_backref_alloc_node(cache, dest->node->start, - node->level); - if (!new_node) - return -ENOMEM; - - new_node->lowest = node->lowest; - new_node->checked = 1; - new_node->root = btrfs_grab_root(dest); - ASSERT(new_node->root); - - if (!node->lowest) { - list_for_each_entry(edge, &node->lower, list[UPPER]) { - new_edge = btrfs_backref_alloc_edge(cache); - if (!new_edge) - goto fail; - - btrfs_backref_link_edge(new_edge, edge->node[LOWER], - new_node, LINK_UPPER); - } - } else { - list_add_tail(&new_node->lower, &cache->leaves); - } - - rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr, - &new_node->rb_node); - if (rb_node) - btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST); - - if (!new_node->lowest) { - list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) { - list_add_tail(&new_edge->list[LOWER], - &new_edge->node[LOWER]->upper); - } - } - return 0; -fail: - while 
(!list_empty(&new_node->lower)) { - new_edge = list_entry(new_node->lower.next, - struct btrfs_backref_edge, list[UPPER]); - list_del(&new_edge->list[UPPER]); - btrfs_backref_free_edge(cache, new_edge); - } - btrfs_backref_free_node(cache, new_node); - return -ENOMEM; -} - -/* * helper to add 'address of tree root -> reloc tree' mapping */ static int __add_reloc_root(struct btrfs_root *root) @@ -950,7 +856,6 @@ int replace_file_extents(struct btrfs_trans_handle *trans, u32 i; int ret = 0; int first = 1; - int dirty = 0; if (rc->stage != UPDATE_DATA_PTRS) return 0; @@ -1030,7 +935,6 @@ int replace_file_extents(struct btrfs_trans_handle *trans, } btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); - dirty = 1; key.offset -= btrfs_file_extent_offset(leaf, fi); ref.action = BTRFS_ADD_DELAYED_REF; @@ -1061,8 +965,6 @@ int replace_file_extents(struct btrfs_trans_handle *trans, break; } } - if (dirty) - btrfs_mark_buffer_dirty(trans, leaf); if (inode) btrfs_add_delayed_iput(inode); return ret; @@ -1255,13 +1157,11 @@ again: */ btrfs_set_node_blockptr(parent, slot, new_bytenr); btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); - btrfs_mark_buffer_dirty(trans, parent); btrfs_set_node_blockptr(path->nodes[level], path->slots[level], old_bytenr); btrfs_set_node_ptr_generation(path->nodes[level], path->slots[level], old_ptr_gen); - btrfs_mark_buffer_dirty(trans, path->nodes[level]); ref.action = BTRFS_ADD_DELAYED_REF; ref.bytenr = old_bytenr; @@ -2058,100 +1958,72 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, int index = 0; int ret; - next = node; - while (1) { - cond_resched(); - next = walk_up_backref(next, edges, &index); - root = next->root; + next = walk_up_backref(node, edges, &index); + root = next->root; - /* - * If there is no root, then our references for this block are - * incomplete, as we should be able to walk all the way up to a - * block that is owned by a root. 
- * - * This path is only for SHAREABLE roots, so if we come upon a - * non-SHAREABLE root then we have backrefs that resolve - * improperly. - * - * Both of these cases indicate file system corruption, or a bug - * in the backref walking code. - */ - if (!root) { - ASSERT(0); - btrfs_err(trans->fs_info, - "bytenr %llu doesn't have a backref path ending in a root", - node->bytenr); - return ERR_PTR(-EUCLEAN); - } - if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { - ASSERT(0); - btrfs_err(trans->fs_info, - "bytenr %llu has multiple refs with one ending in a non-shareable root", - node->bytenr); - return ERR_PTR(-EUCLEAN); - } - - if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { - ret = record_reloc_root_in_trans(trans, root); - if (ret) - return ERR_PTR(ret); - break; - } + /* + * If there is no root, then our references for this block are + * incomplete, as we should be able to walk all the way up to a block + * that is owned by a root. + * + * This path is only for SHAREABLE roots, so if we come upon a + * non-SHAREABLE root then we have backrefs that resolve improperly. + * + * Both of these cases indicate file system corruption, or a bug in the + * backref walking code. + */ + if (unlikely(!root)) { + btrfs_err(trans->fs_info, + "bytenr %llu doesn't have a backref path ending in a root", + node->bytenr); + return ERR_PTR(-EUCLEAN); + } + if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) { + btrfs_err(trans->fs_info, + "bytenr %llu has multiple refs with one ending in a non-shareable root", + node->bytenr); + return ERR_PTR(-EUCLEAN); + } - ret = btrfs_record_root_in_trans(trans, root); + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { + ret = record_reloc_root_in_trans(trans, root); if (ret) return ERR_PTR(ret); - root = root->reloc_root; - - /* - * We could have raced with another thread which failed, so - * root->reloc_root may not be set, return ENOENT in this case. 
- */ - if (!root) - return ERR_PTR(-ENOENT); + goto found; + } - if (next->new_bytenr != root->node->start) { - /* - * We just created the reloc root, so we shouldn't have - * ->new_bytenr set and this shouldn't be in the changed - * list. If it is then we have multiple roots pointing - * at the same bytenr which indicates corruption, or - * we've made a mistake in the backref walking code. - */ - ASSERT(next->new_bytenr == 0); - ASSERT(list_empty(&next->list)); - if (next->new_bytenr || !list_empty(&next->list)) { - btrfs_err(trans->fs_info, - "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", - node->bytenr, next->bytenr); - return ERR_PTR(-EUCLEAN); - } + ret = btrfs_record_root_in_trans(trans, root); + if (ret) + return ERR_PTR(ret); + root = root->reloc_root; - next->new_bytenr = root->node->start; - btrfs_put_root(next->root); - next->root = btrfs_grab_root(root); - ASSERT(next->root); - list_add_tail(&next->list, - &rc->backref_cache.changed); - mark_block_processed(rc, next); - break; - } + /* + * We could have raced with another thread which failed, so + * root->reloc_root may not be set, return ENOENT in this case. + */ + if (!root) + return ERR_PTR(-ENOENT); - WARN_ON(1); - root = NULL; - next = walk_down_backref(edges, &index); - if (!next || next->level <= node->level) - break; - } - if (!root) { + if (next->new_bytenr) { /* - * This can happen if there's fs corruption or if there's a bug - * in the backref lookup code. + * We just created the reloc root, so we shouldn't have + * ->new_bytenr set yet. If it is then we have multiple roots + * pointing at the same bytenr which indicates corruption, or + * we've made a mistake in the backref walking code. 
*/ - ASSERT(0); - return ERR_PTR(-ENOENT); + ASSERT(next->new_bytenr == 0); + btrfs_err(trans->fs_info, + "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", + node->bytenr, next->bytenr); + return ERR_PTR(-EUCLEAN); } + next->new_bytenr = root->node->start; + btrfs_put_root(next->root); + next->root = btrfs_grab_root(root); + ASSERT(next->root); + mark_block_processed(rc, next); +found: next = node; /* setup backref node path for btrfs_reloc_cow_block */ while (1) { @@ -2247,17 +2119,11 @@ static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc, return num_bytes; } -static int reserve_metadata_space(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct btrfs_backref_node *node) +static int refill_metadata_space(struct btrfs_trans_handle *trans, + struct reloc_control *rc, u64 num_bytes) { - struct btrfs_root *root = rc->extent_root; - struct btrfs_fs_info *fs_info = root->fs_info; - u64 num_bytes; + struct btrfs_fs_info *fs_info = trans->fs_info; int ret; - u64 tmp; - - num_bytes = calcu_metadata_size(rc, node) * 2; trans->block_rsv = rc->block_rsv; rc->reserved_bytes += num_bytes; @@ -2270,7 +2136,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes, BTRFS_RESERVE_FLUSH_LIMIT); if (ret) { - tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; + u64 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; + while (tmp <= rc->reserved_bytes) tmp <<= 1; /* @@ -2288,6 +2155,16 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, return 0; } +static int reserve_metadata_space(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_backref_node *node) +{ + u64 num_bytes; + + num_bytes = calcu_metadata_size(rc, node) * 2; + return refill_metadata_space(trans, rc, num_bytes); +} + /* * relocate a block tree, and then update pointers in upper level * blocks that reference the block to point to the 
new location. @@ -2442,7 +2319,7 @@ next: if (!ret && node->pending) { btrfs_backref_drop_node_buffer(node); - list_move_tail(&node->list, &rc->backref_cache.changed); + list_del_init(&node->list); node->pending = 0; } @@ -2605,8 +2482,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, /* * This block was the root block of a root, and this is * the first time we're processing the block and thus it - * should not have had the ->new_bytenr modified and - * should have not been included on the changed list. + * should not have had the ->new_bytenr modified. * * However in the case of corruption we could have * multiple refs pointing to the same block improperly, @@ -2616,8 +2492,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, * normal user in the case of corruption. */ ASSERT(node->new_bytenr == 0); - ASSERT(list_empty(&node->list)); - if (node->new_bytenr || !list_empty(&node->list)) { + if (node->new_bytenr) { btrfs_err(root->fs_info, "bytenr %llu has improper references to it", node->bytenr); @@ -2640,17 +2515,12 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, btrfs_put_root(node->root); node->root = btrfs_grab_root(root); ASSERT(node->root); - list_add_tail(&node->list, &rc->backref_cache.changed); } else { - path->lowest_level = node->level; - if (root == root->fs_info->chunk_root) - btrfs_reserve_chunk_metadata(trans, false); - ret = btrfs_search_slot(trans, root, key, path, 0, 1); - btrfs_release_path(path); - if (root == root->fs_info->chunk_root) - btrfs_trans_release_chunk_metadata(trans); - if (ret > 0) - ret = 0; + btrfs_err(root->fs_info, + "bytenr %llu resolved to a non-shareable root", + node->bytenr); + ret = -EUCLEAN; + goto out; } if (!ret) update_processed_blocks(rc, node); @@ -2658,11 +2528,50 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, ret = do_relocation(trans, rc, node, key, path, 1); } out: - if (ret || node->level == 0 || node->cowonly) + if (ret || 
node->level == 0) btrfs_backref_cleanup_node(&rc->backref_cache, node); return ret; } +static int relocate_cowonly_block(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct tree_block *block, + struct btrfs_path *path) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root; + u64 num_bytes; + int nr_levels; + int ret; + + root = btrfs_get_fs_root(fs_info, block->owner, true); + if (IS_ERR(root)) + return PTR_ERR(root); + + nr_levels = max(btrfs_header_level(root->node) - block->level, 0) + 1; + + num_bytes = fs_info->nodesize * nr_levels; + ret = refill_metadata_space(trans, rc, num_bytes); + if (ret) { + btrfs_put_root(root); + return ret; + } + path->lowest_level = block->level; + if (root == root->fs_info->chunk_root) + btrfs_reserve_chunk_metadata(trans, false); + + ret = btrfs_search_slot(trans, root, &block->key, path, 0, 1); + path->lowest_level = 0; + btrfs_release_path(path); + + if (root == root->fs_info->chunk_root) + btrfs_trans_release_chunk_metadata(trans); + if (ret > 0) + ret = 0; + btrfs_put_root(root); + + return ret; +} + /* * relocate a list of blocks */ @@ -2702,6 +2611,20 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, /* Do tree relocation */ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { + /* + * For COWonly blocks, or the data reloc tree, we only need to + * COW down to the block, there's no need to generate a backref + * tree. 
+ */ + if (block->owner && + (!is_fstree(block->owner) || + block->owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) { + ret = relocate_cowonly_block(trans, rc, block, path); + if (ret) + break; + continue; + } + node = build_backref_tree(trans, rc, &block->key, block->level, block->bytenr); if (IS_ERR(node)) { @@ -2902,6 +2825,7 @@ static int relocate_one_folio(struct reloc_control *rc, const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags); ASSERT(index <= last_index); +again: folio = filemap_lock_folio(inode->i_mapping, index); if (IS_ERR(folio)) { @@ -2937,11 +2861,16 @@ static int relocate_one_folio(struct reloc_control *rc, ret = -EIO; goto release_folio; } + if (folio->mapping != inode->i_mapping) { + folio_unlock(folio); + folio_put(folio); + goto again; + } } /* * We could have lost folio private when we dropped the lock to read the - * folio above, make sure we set_page_extent_mapped here so we have any + * folio above, make sure we set_folio_extent_mapped() here so we have any * of the subpage blocksize stuff we need in place. */ ret = set_folio_extent_mapped(folio); @@ -3793,7 +3722,6 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC); - btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; @@ -4399,8 +4327,18 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, WARN_ON(!first_cow && level == 0); node = rc->backref_cache.path[level]; - BUG_ON(node->bytenr != buf->start && - node->new_bytenr != buf->start); + + /* + * If node->bytenr != buf->start and node->new_bytenr != + * buf->start then we've got the wrong backref node for what we + * expected to see here and the cache is incorrect. 
+ */ + if (unlikely(node->bytenr != buf->start && node->new_bytenr != buf->start)) { + btrfs_err(fs_info, +"bytenr %llu was found but our backref cache was expecting %llu or %llu", + buf->start, node->bytenr, node->new_bytenr); + return -EUCLEAN; + } btrfs_backref_drop_node_buffer(node); atomic_inc(&cow->refs); @@ -4500,10 +4438,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, return ret; } new_root->reloc_root = btrfs_grab_root(reloc_root); - - if (rc->create_reloc_tree) - ret = clone_backref_node(trans, rc, root, reloc_root); - return ret; + return 0; } /* diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 33962671a96c..e22e6b06927a 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -197,7 +197,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_set_root_generation_v2(item, btrfs_root_generation(item)); write_extent_buffer(l, item, ptr, sizeof(*item)); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); out: btrfs_free_path(path); return ret; @@ -447,7 +446,6 @@ again: btrfs_set_root_ref_name_len(leaf, ref, name->len); ptr = (unsigned long)(ref + 1); write_extent_buffer(leaf, name->name, ptr, name->len); - btrfs_mark_buffer_dirty(trans, leaf); if (key.type == BTRFS_ROOT_BACKREF_KEY) { btrfs_release_path(path); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 204c928beaf9..531312efee8d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1541,6 +1541,10 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, u64 extent_gen; int ret; + if (unlikely(!extent_root)) { + btrfs_err(fs_info, "no valid extent root for scrub"); + return -EUCLEAN; + } memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * stripe->nr_sectors); scrub_stripe_reset_bitmaps(stripe); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 7254279c3cc9..f437138fefbc 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5280,6 +5280,7 @@ static int put_file_data(struct send_ctx *sctx, 
u64 offset, u32 len) unsigned cur_len = min_t(unsigned, len, PAGE_SIZE - pg_offset); +again: folio = filemap_lock_folio(mapping, index); if (IS_ERR(folio)) { page_cache_sync_readahead(mapping, @@ -5312,6 +5313,11 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) ret = -EIO; break; } + if (folio->mapping != mapping) { + folio_unlock(folio); + folio_put(folio); + goto again; + } } memcpy_from_folio(sctx->send_buf + sctx->send_size, folio, @@ -7253,7 +7259,7 @@ static int changed_cb(struct btrfs_path *left_path, enum btrfs_compare_tree_result result, struct send_ctx *sctx) { - int ret = 0; + int ret; /* * We can not hold the commit root semaphore here. This is because in @@ -7313,7 +7319,6 @@ static int changed_cb(struct btrfs_path *left_path, return 0; } result = BTRFS_COMPARE_TREE_CHANGED; - ret = 0; } sctx->left_path = left_path; diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 255e85f78313..a341d087567a 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -14,6 +14,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "zoned.h" /* * HOW DOES SPACE RESERVATION WORK @@ -127,6 +128,14 @@ * churn a lot and we can avoid making some extent tree modifications if we * are able to delay for as long as possible. * + * RESET_ZONES + * This state works only for the zoned mode. On the zoned mode, we cannot + * reuse once allocated then freed region until we reset the zone, due to + * the sequential write zone requirement. The RESET_ZONES state resets the + * zones of an unused block group and let us reuse the space. The reusing + * is faster than removing the block group and allocating another block + * group on the zones. 
+ * * ALLOC_CHUNK * We will skip this the first time through space reservation, because of * overcommit and we don't want to have a lot of useless metadata space when @@ -316,7 +325,7 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, found->bytes_used += block_group->used; found->disk_used += block_group->used * factor; found->bytes_readonly += block_group->bytes_super; - btrfs_space_info_update_bytes_zone_unusable(info, found, block_group->zone_unusable); + btrfs_space_info_update_bytes_zone_unusable(found, block_group->zone_unusable); if (block_group->length > 0) found->full = 0; btrfs_try_granting_tickets(info, found); @@ -489,9 +498,7 @@ again: if ((used + ticket->bytes <= space_info->total_bytes) || btrfs_can_overcommit(fs_info, space_info, ticket->bytes, flush)) { - btrfs_space_info_update_bytes_may_use(fs_info, - space_info, - ticket->bytes); + btrfs_space_info_update_bytes_may_use(space_info, ticket->bytes); remove_ticket(space_info, ticket); ticket->bytes = 0; space_info->tickets_id++; @@ -834,6 +841,9 @@ static void flush_space(struct btrfs_fs_info *fs_info, */ ret = btrfs_commit_current_transaction(root); break; + case RESET_ZONES: + ret = btrfs_reset_unused_block_groups(space_info, num_bytes); + break; default: ret = -ENOSPC; break; @@ -1086,9 +1096,14 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) enum btrfs_flush_state flush_state; int commit_cycles = 0; u64 last_tickets_id; + enum btrfs_flush_state final_state; fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + if (btrfs_is_zoned(fs_info)) + final_state = RESET_ZONES; + else + final_state = COMMIT_TRANS; spin_lock(&space_info->lock); to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); @@ -1141,7 +1156,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) flush_state++; - 
if (flush_state > COMMIT_TRANS) { + if (flush_state > final_state) { commit_cycles++; if (commit_cycles > 2) { if (maybe_fail_all_tickets(fs_info, space_info)) { @@ -1155,7 +1170,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) } } spin_unlock(&space_info->lock); - } while (flush_state <= COMMIT_TRANS); + } while (flush_state <= final_state); } /* @@ -1286,6 +1301,10 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) * This is where we reclaim all of the pinned space generated by running the * iputs * + * RESET_ZONES + * This state works only for the zoned mode. We scan the unused block group + * list and reset the zones and reuse the block group. + * * ALLOC_CHUNK_FORCE * For data we start with alloc chunk force, however we could have been full * before, and then the transaction commit could have freed new block groups, @@ -1295,6 +1314,7 @@ static const enum btrfs_flush_state data_flush_states[] = { FLUSH_DELALLOC_FULL, RUN_DELAYED_IPUTS, COMMIT_TRANS, + RESET_ZONES, ALLOC_CHUNK_FORCE, }; @@ -1386,6 +1406,7 @@ void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) static const enum btrfs_flush_state priority_flush_states[] = { FLUSH_DELAYED_ITEMS_NR, FLUSH_DELAYED_ITEMS, + RESET_ZONES, ALLOC_CHUNK, }; @@ -1399,6 +1420,7 @@ static const enum btrfs_flush_state evict_flush_states[] = { FLUSH_DELALLOC_FULL, ALLOC_CHUNK, COMMIT_TRANS, + RESET_ZONES, }; static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, @@ -1690,8 +1712,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, if (!pending_tickets && ((used + orig_bytes <= space_info->total_bytes) || btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { - btrfs_space_info_update_bytes_may_use(fs_info, space_info, - orig_bytes); + btrfs_space_info_update_bytes_may_use(space_info, orig_bytes); ret = 0; } @@ -1703,8 +1724,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, if (ret && unlikely(flush == 
BTRFS_RESERVE_FLUSH_EMERGENCY)) { used = btrfs_space_info_used(space_info, false); if (used + orig_bytes <= space_info->total_bytes) { - btrfs_space_info_update_bytes_may_use(fs_info, space_info, - orig_bytes); + btrfs_space_info_update_bytes_may_use(space_info, orig_bytes); ret = 0; } } @@ -2082,3 +2102,32 @@ void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info) do_reclaim_sweep(space_info, raid); } } + +void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len) +{ + struct btrfs_fs_info *fs_info = space_info->fs_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + + lockdep_assert_held(&space_info->lock); + + /* Prioritize the global reservation to receive the freed space. */ + if (global_rsv->space_info != space_info) + goto grant; + + spin_lock(&global_rsv->lock); + if (!global_rsv->full) { + u64 to_add = min(len, global_rsv->size - global_rsv->reserved); + + global_rsv->reserved += to_add; + btrfs_space_info_update_bytes_may_use(space_info, to_add); + if (global_rsv->reserved >= global_rsv->size) + global_rsv->full = 1; + len -= to_add; + } + spin_unlock(&global_rsv->lock); + +grant: + /* Add to any tickets we may have. */ + if (len) + btrfs_try_granting_tickets(fs_info, space_info); +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index efbecc0c5258..a96efdb5e681 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -79,6 +79,10 @@ enum btrfs_reserve_flush_enum { BTRFS_RESERVE_FLUSH_EMERGENCY, }; +/* + * Please be aware that the order of enum values will be the order of the reclaim + * process in btrfs_async_reclaim_metadata_space(). 
+ */ enum btrfs_flush_state { FLUSH_DELAYED_ITEMS_NR = 1, FLUSH_DELAYED_ITEMS = 2, @@ -91,6 +95,7 @@ enum btrfs_flush_state { ALLOC_CHUNK_FORCE = 9, RUN_DELAYED_IPUTS = 10, COMMIT_TRANS = 11, + RESET_ZONES = 12, }; struct btrfs_space_info { @@ -229,10 +234,10 @@ static inline bool btrfs_mixed_space_info(const struct btrfs_space_info *space_i */ #define DECLARE_SPACE_INFO_UPDATE(name, trace_name) \ static inline void \ -btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \ - struct btrfs_space_info *sinfo, \ +btrfs_space_info_update_##name(struct btrfs_space_info *sinfo, \ s64 bytes) \ { \ + struct btrfs_fs_info *fs_info = sinfo->fs_info; \ const u64 abs_bytes = (bytes < 0) ? -bytes : bytes; \ lockdep_assert_held(&sinfo->lock); \ trace_update_##name(fs_info, sinfo, sinfo->name, bytes); \ @@ -275,13 +280,12 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush); static inline void btrfs_space_info_free_bytes_may_use( - struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes) { spin_lock(&space_info->lock); - btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes); - btrfs_try_granting_tickets(fs_info, space_info); + btrfs_space_info_update_bytes_may_use(space_info, -num_bytes); + btrfs_try_granting_tickets(space_info->fs_info, space_info); spin_unlock(&space_info->lock); } int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, @@ -295,5 +299,6 @@ void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info); int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info); void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info); +void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len); #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 8c68059ac1b0..722acf768396 100644 --- a/fs/btrfs/subpage.c +++ 
b/fs/btrfs/subpage.c @@ -635,6 +635,28 @@ IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered, IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, folio_test_checked); +#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ +{ \ + const int sectors_per_page = fs_info->sectors_per_page; \ + \ + ASSERT(sectors_per_page < BITS_PER_LONG); \ + *dst = bitmap_read(subpage->bitmaps, \ + sectors_per_page * btrfs_bitmap_nr_##name, \ + sectors_per_page); \ +} + +#define SUBPAGE_DUMP_BITMAP(fs_info, folio, name, start, len) \ +{ \ + const struct btrfs_subpage *subpage = folio_get_private(folio); \ + unsigned long bitmap; \ + \ + GET_SUBPAGE_BITMAP(subpage, fs_info, name, &bitmap); \ + btrfs_warn(fs_info, \ + "dumpping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \ + start, len, folio_pos(folio), \ + fs_info->sectors_per_page, &bitmap); \ +} + /* * Make sure not only the page dirty bit is cleared, but also subpage dirty bit * is cleared. @@ -660,6 +682,10 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, subpage = folio_get_private(folio); ASSERT(subpage); spin_lock_irqsave(&subpage->lock, flags); + if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) { + SUBPAGE_DUMP_BITMAP(fs_info, folio, dirty, start, len); + ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); + } ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -689,23 +715,16 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, nbits = len >> fs_info->sectorsize_bits; spin_lock_irqsave(&subpage->lock, flags); /* Target range should not yet be locked. 
*/ - ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); + if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) { + SUBPAGE_DUMP_BITMAP(fs_info, folio, locked, start, len); + ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); + } bitmap_set(subpage->bitmaps, start_bit, nbits); ret = atomic_add_return(nbits, &subpage->nr_locked); ASSERT(ret <= fs_info->sectors_per_page); spin_unlock_irqrestore(&subpage->lock, flags); } -#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ -{ \ - const int sectors_per_page = fs_info->sectors_per_page; \ - \ - ASSERT(sectors_per_page < BITS_PER_LONG); \ - *dst = bitmap_read(subpage->bitmaps, \ - sectors_per_page * btrfs_bitmap_nr_##name, \ - sectors_per_page); \ -} - void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { @@ -716,6 +735,7 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, unsigned long writeback_bitmap; unsigned long ordered_bitmap; unsigned long checked_bitmap; + unsigned long locked_bitmap; unsigned long flags; ASSERT(folio_test_private(folio) && folio_get_private(folio)); @@ -728,15 +748,16 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, GET_SUBPAGE_BITMAP(subpage, fs_info, writeback, &writeback_bitmap); GET_SUBPAGE_BITMAP(subpage, fs_info, ordered, &ordered_bitmap); GET_SUBPAGE_BITMAP(subpage, fs_info, checked, &checked_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &checked_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &locked_bitmap); spin_unlock_irqrestore(&subpage->lock, flags); dump_page(folio_page(folio, 0), "btrfs subpage dump"); btrfs_warn(fs_info, -"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", +"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", start, len, 
folio_pos(folio), sectors_per_page, &uptodate_bitmap, sectors_per_page, &dirty_bitmap, + sectors_per_page, &locked_bitmap, sectors_per_page, &writeback_bitmap, sectors_per_page, &ordered_bitmap, sectors_per_page, &checked_bitmap); diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 428fa9389fd4..44fff1f4eac4 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -137,6 +137,19 @@ DECLARE_BTRFS_SUBPAGE_OPS(writeback); DECLARE_BTRFS_SUBPAGE_OPS(ordered); DECLARE_BTRFS_SUBPAGE_OPS(checked); +/* + * Helper for error cleanup, where a folio will have its dirty flag cleared, + * with writeback started and finished. + */ +static inline void btrfs_folio_clamp_finish_io(struct btrfs_fs_info *fs_info, + struct folio *locked_folio, + u64 start, u32 len) +{ + btrfs_folio_clamp_clear_dirty(fs_info, locked_folio, start, len); + btrfs_folio_clamp_set_writeback(fs_info, locked_folio, start, len); + btrfs_folio_clamp_clear_writeback(fs_info, locked_folio, start, len); +} + bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7dfe5005129a..dc4fee519ca6 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -961,7 +961,7 @@ static int btrfs_fill_super(struct super_block *sb, #endif sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; - sb->s_iflags |= SB_I_CGROUPWB; + sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM; err = super_setup_bdi(sb); if (err) { @@ -971,7 +971,7 @@ static int btrfs_fill_super(struct super_block *sb, err = open_ctree(sb, fs_devices); if (err) { - btrfs_err(fs_info, "open_ctree failed"); + btrfs_err(fs_info, "open_ctree failed: %d", err); return err; } @@ -2446,6 +2446,9 @@ static __cold void btrfs_interface_exit(void) static int __init btrfs_print_mod_info(void) { static const char options[] = "" +#ifdef CONFIG_BTRFS_EXPERIMENTAL + ", experimental=on" +#endif #ifdef CONFIG_BTRFS_DEBUG ", debug=on" #endif @@ -2466,7 +2469,17 @@ 
static int __init btrfs_print_mod_info(void) ", fsverity=no" #endif ; + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + if (btrfs_get_mod_read_policy() == NULL) + pr_info("Btrfs loaded%s\n", options); + else + pr_info("Btrfs loaded%s, read_policy=%s\n", + options, btrfs_get_mod_read_policy()); +#else pr_info("Btrfs loaded%s\n", options); +#endif + return 0; } @@ -2524,6 +2537,11 @@ static const struct init_sequence mod_init_seq[] = { }, { .init_func = extent_map_init, .exit_func = extent_map_exit, +#ifdef CONFIG_BTRFS_EXPERIMENTAL + }, { + .init_func = btrfs_read_policy_init, + .exit_func = NULL, +#endif }, { .init_func = ordered_data_init, .exit_func = ordered_data_exit, diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index fdcbf650ac31..14f53f757555 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1118,7 +1118,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->nodesize); + return sysfs_emit(buf, "%u\n", fs_info->nodesize); } BTRFS_ATTR(, nodesize, btrfs_nodesize_show); @@ -1128,7 +1128,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); + return sysfs_emit(buf, "%u\n", fs_info->sectorsize); } BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); @@ -1180,7 +1180,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); + return sysfs_emit(buf, "%u\n", fs_info->sectorsize); } BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show); @@ -1305,7 +1305,73 @@ static ssize_t btrfs_temp_fsid_show(struct kobject *kobj, } BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); -static const char * const btrfs_read_policy_name[] = { "pid" }; +static const char *btrfs_read_policy_name[] = { + "pid", +#ifdef 
CONFIG_BTRFS_EXPERIMENTAL + "round-robin", + "devid", +#endif +}; + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + +/* Global module configuration parameters. */ +static char *read_policy; +char *btrfs_get_mod_read_policy(void) +{ + return read_policy; +} + +/* Set perms to 0, disable /sys/module/btrfs/parameter/read_policy interface. */ +module_param(read_policy, charp, 0); +MODULE_PARM_DESC(read_policy, +"Global read policy: pid (default), round-robin[:<min_contig_read>], devid[:<devid>]"); +#endif + +int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) +{ + char param[32]; + char __maybe_unused *value_str; + + if (!str || strlen(str) == 0) + return 0; + + strscpy(param, str); + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* Separate value from input in policy:value format. */ + value_str = strchr(param, ':'); + if (value_str) { + int ret; + + *value_str = 0; + value_str++; + if (!value_ret) + return -EINVAL; + ret = kstrtos64(value_str, 10, value_ret); + if (ret) + return -EINVAL; + if (*value_ret < 0) + return -ERANGE; + } +#endif + + return sysfs_match_string(btrfs_read_policy_name, param); +} + +#ifdef CONFIG_BTRFS_EXPERIMENTAL +int __init btrfs_read_policy_init(void) +{ + s64 value; + + if (btrfs_read_policy_to_enum(read_policy, &value) == -EINVAL) { + btrfs_err(NULL, "invalid read policy or value %s", read_policy); + return -EINVAL; + } + + return 0; +} +#endif static ssize_t btrfs_read_policy_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -1316,14 +1382,25 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, int i; for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { - if (policy == i) - ret += sysfs_emit_at(buf, ret, "%s[%s]", - (ret == 0 ? "" : " "), - btrfs_read_policy_name[i]); - else - ret += sysfs_emit_at(buf, ret, "%s%s", - (ret == 0 ? 
"" : " "), - btrfs_read_policy_name[i]); + if (ret != 0) + ret += sysfs_emit_at(buf, ret, " "); + + if (i == policy) + ret += sysfs_emit_at(buf, ret, "["); + + ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]); + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + if (i == BTRFS_READ_POLICY_RR) + ret += sysfs_emit_at(buf, ret, ":%u", + READ_ONCE(fs_devices->rr_min_contig_read)); + + if (i == BTRFS_READ_POLICY_DEVID) + ret += sysfs_emit_at(buf, ret, ":%llu", + READ_ONCE(fs_devices->read_devid)); +#endif + if (i == policy) + ret += sysfs_emit_at(buf, ret, "]"); } ret += sysfs_emit_at(buf, ret, "\n"); @@ -1336,21 +1413,80 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, const char *buf, size_t len) { struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); - int i; + int index; + s64 value = -1; - for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { - if (sysfs_streq(buf, btrfs_read_policy_name[i])) { - if (i != READ_ONCE(fs_devices->read_policy)) { - WRITE_ONCE(fs_devices->read_policy, i); - btrfs_info(fs_devices->fs_info, - "read policy set to '%s'", - btrfs_read_policy_name[i]); + index = btrfs_read_policy_to_enum(buf, &value); + if (index < 0) + return -EINVAL; + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* If moving from RR then disable collecting fs stats. 
*/ + if (fs_devices->read_policy == BTRFS_READ_POLICY_RR && index != BTRFS_READ_POLICY_RR) + fs_devices->collect_fs_stats = false; + + if (index == BTRFS_READ_POLICY_RR) { + if (value != -1) { + const u32 sectorsize = fs_devices->fs_info->sectorsize; + + if (!IS_ALIGNED(value, sectorsize)) { + u64 temp_value = round_up(value, sectorsize); + + btrfs_debug(fs_devices->fs_info, +"read_policy: min contig read %lld should be multiple of sectorsize %u, rounded to %llu", + value, sectorsize, temp_value); + value = temp_value; } - return len; + } else { + value = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; + } + + if (index != READ_ONCE(fs_devices->read_policy) || + value != READ_ONCE(fs_devices->rr_min_contig_read)) { + WRITE_ONCE(fs_devices->read_policy, index); + WRITE_ONCE(fs_devices->rr_min_contig_read, value); + + btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'", + btrfs_read_policy_name[index], value); + } + + fs_devices->collect_fs_stats = true; + + return len; + } + + if (index == BTRFS_READ_POLICY_DEVID) { + if (value != -1) { + BTRFS_DEV_LOOKUP_ARGS(args); + + /* Validate input devid. */ + args.devid = value; + if (btrfs_find_device(fs_devices, &args) == NULL) + return -EINVAL; + } else { + /* Set default devid to the devid of the latest device. 
*/ + value = fs_devices->latest_dev->devid; } + + if (index != READ_ONCE(fs_devices->read_policy) || + value != READ_ONCE(fs_devices->read_devid)) { + WRITE_ONCE(fs_devices->read_policy, index); + WRITE_ONCE(fs_devices->read_devid, value); + + btrfs_info(fs_devices->fs_info, "read policy set to '%s:%llu'", + btrfs_read_policy_name[index], value); + } + + return len; + } +#endif + if (index != READ_ONCE(fs_devices->read_policy)) { + WRITE_ONCE(fs_devices->read_policy, index); + btrfs_info(fs_devices->fs_info, "read policy set to '%s'", + btrfs_read_policy_name[index]); } - return -EINVAL; + return len; } BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store); diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index e6a284c59809..3fc5c6f90dc4 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -47,5 +47,11 @@ void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info); int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info); void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup); +int btrfs_read_policy_to_enum(const char *str, s64 *value); + +#ifdef CONFIG_BTRFS_EXPERIMENTAL +int __init btrfs_read_policy_init(void); +char *btrfs_get_mod_read_policy(void); +#endif #endif diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index e607b5d52fb1..5eff8d7d2360 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -30,6 +30,7 @@ const char *test_error[] = { [TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map", [TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map", [TEST_ALLOC_IO_CONTEXT] = "cannot allocate io context", + [TEST_ALLOC_TRANSACTION] = "cannot allocate transaction", }; static const struct super_operations btrfs_test_super_ops = { @@ -142,6 +143,11 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); + + /* CRC32C 
csum size. */ + fs_info->csum_size = 4; + fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / + fs_info->csum_size; set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); test_mnt->mnt_sb->s_fs_info = fs_info; @@ -247,6 +253,15 @@ void btrfs_free_dummy_block_group(struct btrfs_block_group *cache) kfree(cache); } +void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info) +{ + memset(trans, 0, sizeof(*trans)); + trans->fs_info = fs_info; + xa_init(&trans->delayed_refs.head_refs); + xa_init(&trans->delayed_refs.dirty_extents); + spin_lock_init(&trans->delayed_refs.lock); +} + void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { @@ -295,6 +310,9 @@ int btrfs_run_sanity_tests(void) ret = btrfs_test_raid_stripe_tree(sectorsize, nodesize); if (ret) goto out; + ret = btrfs_test_delayed_refs(sectorsize, nodesize); + if (ret) + goto out; } } ret = btrfs_test_extent_map(); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index b524ecf2f452..4307bdaa6749 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -6,6 +6,8 @@ #ifndef BTRFS_TESTS_H #define BTRFS_TESTS_H +#include <linux/types.h> + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int btrfs_run_sanity_tests(void); @@ -25,12 +27,14 @@ enum { TEST_ALLOC_EXTENT_MAP, TEST_ALLOC_CHUNK_MAP, TEST_ALLOC_IO_CONTEXT, + TEST_ALLOC_TRANSACTION, }; extern const char *test_error[]; struct btrfs_root; struct btrfs_trans_handle; +struct btrfs_transaction; int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize); int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize); @@ -40,6 +44,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize); int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize); int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize); int btrfs_test_extent_map(void); +int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize); struct inode 
*btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize); void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info); @@ -49,6 +54,7 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long lengt void btrfs_free_dummy_block_group(struct btrfs_block_group *cache); void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); +void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info); #else static inline int btrfs_run_sanity_tests(void) diff --git a/fs/btrfs/tests/delayed-refs-tests.c b/fs/btrfs/tests/delayed-refs-tests.c new file mode 100644 index 000000000000..265370e79a54 --- /dev/null +++ b/fs/btrfs/tests/delayed-refs-tests.c @@ -0,0 +1,1016 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/sizes.h> +#include "btrfs-tests.h" +#include "../transaction.h" +#include "../delayed-ref.h" +#include "../extent-tree.h" + +#define FAKE_ROOT_OBJECTID 256 +#define FAKE_BYTENR 0 +#define FAKE_LEVEL 1 +#define FAKE_INO 256 +#define FAKE_FILE_OFFSET 0 +#define FAKE_PARENT SZ_1M + +struct ref_head_check { + u64 bytenr; + u64 num_bytes; + int ref_mod; + int total_ref_mod; + int must_insert; +}; + +struct ref_node_check { + u64 bytenr; + u64 num_bytes; + int ref_mod; + enum btrfs_delayed_ref_action action; + u8 type; + u64 parent; + u64 root; + u64 owner; + u64 offset; +}; + +static enum btrfs_ref_type ref_type_from_disk_ref_type(u8 type) +{ + if ((type == BTRFS_TREE_BLOCK_REF_KEY) || + (type == BTRFS_SHARED_BLOCK_REF_KEY)) + return BTRFS_REF_METADATA; + return BTRFS_REF_DATA; +} + +static void delete_delayed_ref_head(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + + 
spin_lock(&delayed_refs->lock); + spin_lock(&head->lock); + btrfs_delete_ref_head(fs_info, delayed_refs, head); + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); + + btrfs_delayed_ref_unlock(head); + btrfs_put_delayed_ref_head(head); +} + +static void delete_delayed_ref_node(struct btrfs_delayed_ref_head *head, + struct btrfs_delayed_ref_node *node) +{ + rb_erase_cached(&node->ref_node, &head->ref_tree); + RB_CLEAR_NODE(&node->ref_node); + if (!list_empty(&node->add_list)) + list_del_init(&node->add_list); + btrfs_put_delayed_ref(node); +} + +static int validate_ref_head(struct btrfs_delayed_ref_head *head, + struct ref_head_check *check) +{ + if (head->bytenr != check->bytenr) { + test_err("invalid bytenr have: %llu want: %llu", head->bytenr, + check->bytenr); + return -EINVAL; + } + + if (head->num_bytes != check->num_bytes) { + test_err("invalid num_bytes have: %llu want: %llu", + head->num_bytes, check->num_bytes); + return -EINVAL; + } + + if (head->ref_mod != check->ref_mod) { + test_err("invalid ref_mod have: %d want: %d", head->ref_mod, + check->ref_mod); + return -EINVAL; + } + + if (head->total_ref_mod != check->total_ref_mod) { + test_err("invalid total_ref_mod have: %d want: %d", + head->total_ref_mod, check->total_ref_mod); + return -EINVAL; + } + + if (head->must_insert_reserved != check->must_insert) { + test_err("invalid must_insert have: %d want: %d", + head->must_insert_reserved, check->must_insert); + return -EINVAL; + } + + return 0; +} + +static int validate_ref_node(struct btrfs_delayed_ref_node *node, + struct ref_node_check *check) +{ + if (node->bytenr != check->bytenr) { + test_err("invalid bytenr have: %llu want: %llu", node->bytenr, + check->bytenr); + return -EINVAL; + } + + if (node->num_bytes != check->num_bytes) { + test_err("invalid num_bytes have: %llu want: %llu", + node->num_bytes, check->num_bytes); + return -EINVAL; + } + + if (node->ref_mod != check->ref_mod) { + test_err("invalid ref_mod have: %d want: %d", 
node->ref_mod, + check->ref_mod); + return -EINVAL; + } + + if (node->action != check->action) { + test_err("invalid action have: %d want: %d", node->action, + check->action); + return -EINVAL; + } + + if (node->parent != check->parent) { + test_err("invalid parent have: %llu want: %llu", node->parent, + check->parent); + return -EINVAL; + } + + if (node->ref_root != check->root) { + test_err("invalid root have: %llu want: %llu", node->ref_root, + check->root); + return -EINVAL; + } + + if (node->type != check->type) { + test_err("invalid type have: %d want: %d", node->type, + check->type); + return -EINVAL; + } + + if (btrfs_delayed_ref_owner(node) != check->owner) { + test_err("invalid owner have: %llu want: %llu", + btrfs_delayed_ref_owner(node), check->owner); + return -EINVAL; + } + + if (btrfs_delayed_ref_offset(node) != check->offset) { + test_err("invalid offset have: %llu want: %llu", + btrfs_delayed_ref_offset(node), check->offset); + return -EINVAL; + } + + return 0; +} + +static int simple_test(struct btrfs_trans_handle *trans, + struct ref_head_check *head_check, + struct ref_node_check *node_check) +{ + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_node *node; + struct btrfs_ref ref = { + .type = ref_type_from_disk_ref_type(node_check->type), + .action = node_check->action, + .parent = node_check->parent, + .ref_root = node_check->root, + .bytenr = node_check->bytenr, + .num_bytes = fs_info->nodesize, + }; + int ret; + + if (ref.type == BTRFS_REF_METADATA) + btrfs_init_tree_ref(&ref, node_check->owner, node_check->root, + false); + else + btrfs_init_data_ref(&ref, node_check->owner, node_check->offset, + node_check->root, true); + + if (ref.type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + 
test_err("failed ref action %d", ret); + return ret; + } + + head = btrfs_select_ref_head(fs_info, delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + return -EINVAL; + } + + ret = -EINVAL; + if (validate_ref_head(head, head_check)) + goto out; + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, node_check)) + goto out; + ret = 0; +out: + btrfs_unselect_ref_head(delayed_refs, head); + btrfs_destroy_delayed_refs(trans->transaction); + return ret; +} + +/* + * These are simple tests, make sure that our btrfs_ref's get turned into the + * appropriate btrfs_delayed_ref_node based on their settings and action. + */ +static int simple_tests(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct ref_head_check head_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 1, + .total_ref_mod = 1, + }; + struct ref_node_check node_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 1, + .action = BTRFS_ADD_DELAYED_REF, + .type = BTRFS_TREE_BLOCK_REF_KEY, + .parent = 0, + .root = FAKE_ROOT_OBJECTID, + .owner = FAKE_LEVEL, + .offset = 0, + }; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single add tree block failed"); + return -EINVAL; + } + + node_check.type = BTRFS_EXTENT_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single add extent data failed"); + return -EINVAL; + } + + node_check.parent = FAKE_PARENT; + node_check.type = BTRFS_SHARED_BLOCK_REF_KEY; + node_check.owner = FAKE_LEVEL; + node_check.offset = 0; + + if (simple_test(trans, &head_check, &node_check)) { 
+ test_err("single add shared block failed"); + return -EINVAL; + } + + node_check.type = BTRFS_SHARED_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single add shared data failed"); + return -EINVAL; + } + + head_check.ref_mod = -1; + head_check.total_ref_mod = -1; + node_check.action = BTRFS_DROP_DELAYED_REF; + node_check.type = BTRFS_TREE_BLOCK_REF_KEY; + node_check.owner = FAKE_LEVEL; + node_check.offset = 0; + node_check.parent = 0; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single drop tree block failed"); + return -EINVAL; + } + + node_check.type = BTRFS_EXTENT_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single drop extent data failed"); + return -EINVAL; + } + + node_check.parent = FAKE_PARENT; + node_check.type = BTRFS_SHARED_BLOCK_REF_KEY; + node_check.owner = FAKE_LEVEL; + node_check.offset = 0; + if (simple_test(trans, &head_check, &node_check)) { + test_err("single drop shared block failed"); + return -EINVAL; + } + + node_check.type = BTRFS_SHARED_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + if (simple_test(trans, &head_check, &node_check)) { + test_err("single drop shared data failed"); + return -EINVAL; + } + + return 0; +} + +/* + * Merge tests, validate that we do delayed ref merging properly, the ref counts + * all end up properly, and delayed refs are deleted once they're no longer + * needed. 
+ */ +static int merge_tests(struct btrfs_trans_handle *trans, + enum btrfs_ref_type type) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_head *head = NULL; + struct btrfs_delayed_ref_node *node; + struct btrfs_ref ref = { + .type = type, + .action = BTRFS_ADD_DELAYED_REF, + .parent = 0, + .ref_root = FAKE_ROOT_OBJECTID, + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + }; + struct ref_head_check head_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 0, + .total_ref_mod = 0, + }; + struct ref_node_check node_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 2, + .action = BTRFS_ADD_DELAYED_REF, + .parent = 0, + .root = FAKE_ROOT_OBJECTID, + }; + int ret; + + /* + * First add a ref and then drop it, make sure we get a head ref with a + * 0 total ref mod and no nodes. + */ + if (type == BTRFS_REF_METADATA) { + node_check.type = BTRFS_TREE_BLOCK_REF_KEY; + node_check.owner = FAKE_LEVEL; + btrfs_init_tree_ref(&ref, FAKE_LEVEL, FAKE_ROOT_OBJECTID, false); + } else { + node_check.type = BTRFS_EXTENT_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + btrfs_init_data_ref(&ref, FAKE_INO, FAKE_FILE_OFFSET, + FAKE_ROOT_OBJECTID, true); + } + + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + return ret; + } + + ref.action = BTRFS_DROP_DELAYED_REF; + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to 
find delayed ref head"); + goto out; + } + + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("single add and drop failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + + delete_delayed_ref_head(trans, head); + head = NULL; + + /* + * Add a ref, then add another ref, make sure we get a head ref with a + * 2 total ref mod and 1 node. + */ + ref.action = BTRFS_ADD_DELAYED_REF; + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + goto out; + } + + head_check.ref_mod = 2; + head_check.total_ref_mod = 2; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("double add failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + delete_delayed_ref_head(trans, head); + head = NULL; + + /* Add two drop refs, 
make sure they are merged properly. */ + ref.action = BTRFS_DROP_DELAYED_REF; + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + goto out; + } + + head_check.ref_mod = -2; + head_check.total_ref_mod = -2; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("double drop failed"); + goto out; + } + + node_check.action = BTRFS_DROP_DELAYED_REF; + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + delete_delayed_ref_head(trans, head); + head = NULL; + + /* Add multiple refs, then drop until we go negative again. 
*/ + ref.action = BTRFS_ADD_DELAYED_REF; + for (int i = 0; i < 10; i++) { + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + ref.action = BTRFS_DROP_DELAYED_REF; + for (int i = 0; i < 12; i++) { + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + goto out; + } + + head_check.ref_mod = -2; + head_check.total_ref_mod = -2; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("double drop failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + + delete_delayed_ref_head(trans, head); + head = NULL; + + /* Drop multiple refs, then add until we go positive again. 
*/ + ref.action = BTRFS_DROP_DELAYED_REF; + for (int i = 0; i < 10; i++) { + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + ref.action = BTRFS_ADD_DELAYED_REF; + for (int i = 0; i < 12; i++) { + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + goto out; + } + + head_check.ref_mod = 2; + head_check.total_ref_mod = 2; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("add and drop to positive failed"); + goto out; + } + + node_check.action = BTRFS_ADD_DELAYED_REF; + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + delete_delayed_ref_head(trans, head); + head = NULL; + + /* + * Add a bunch of refs with different roots and parents, then drop them + * all, make sure everything is properly merged. 
+ */ + ref.action = BTRFS_ADD_DELAYED_REF; + for (int i = 0; i < 50; i++) { + if (!(i % 2)) { + ref.parent = 0; + ref.ref_root = FAKE_ROOT_OBJECTID + i; + } else { + ref.parent = FAKE_PARENT + (i * fs_info->nodesize); + } + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + ref.action = BTRFS_DROP_DELAYED_REF; + for (int i = 0; i < 50; i++) { + if (!(i % 2)) { + ref.parent = 0; + ref.ref_root = FAKE_ROOT_OBJECTID + i; + } else { + ref.parent = FAKE_PARENT + (i * fs_info->nodesize); + } + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + goto out; + } + + head_check.ref_mod = 0; + head_check.total_ref_mod = 0; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("add and drop multiple failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + ret = 0; +out: + if (!IS_ERR_OR_NULL(head)) + btrfs_unselect_ref_head(&trans->transaction->delayed_refs, head); + btrfs_destroy_delayed_refs(trans->transaction); + return ret; +} + +/* + * Basic test to validate we always get the add operations first followed by any + * delete operations. 
+ */ +static int select_delayed_refs_test(struct btrfs_trans_handle *trans) +{ + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_head *head = NULL; + struct btrfs_delayed_ref_node *node; + struct btrfs_ref ref = { + .type = BTRFS_REF_METADATA, + .action = BTRFS_DROP_DELAYED_REF, + .parent = 0, + .ref_root = FAKE_ROOT_OBJECTID, + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + }; + struct ref_head_check head_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 0, + .total_ref_mod = 0, + }; + struct ref_node_check node_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 1, + .action = BTRFS_ADD_DELAYED_REF, + .type = BTRFS_TREE_BLOCK_REF_KEY, + .parent = 0, + .owner = FAKE_LEVEL, + .offset = 0, + }; + int ret; + + /* Add the drop first. */ + btrfs_init_tree_ref(&ref, FAKE_LEVEL, FAKE_ROOT_OBJECTID, false); + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + return ret; + } + + /* + * Now add the add, and make it a different root so it's logically later + * in the rb tree. 
+ */ + ref.action = BTRFS_ADD_DELAYED_REF; + ref.ref_root = FAKE_ROOT_OBJECTID + 1; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + head = NULL; + goto out; + } + + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("head check failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + node_check.root = FAKE_ROOT_OBJECTID + 1; + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + node_check.action = BTRFS_DROP_DELAYED_REF; + node_check.root = FAKE_ROOT_OBJECTID; + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + delete_delayed_ref_node(head, node); + delete_delayed_ref_head(trans, head); + head = NULL; + + /* + * Now we're going to do the same thing, but we're going to have an add + * that gets deleted because of a merge, and make sure we still have + * another add in place. 
+ */ + ref.action = BTRFS_DROP_DELAYED_REF; + ref.ref_root = FAKE_ROOT_OBJECTID; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + ref.action = BTRFS_ADD_DELAYED_REF; + ref.ref_root = FAKE_ROOT_OBJECTID + 1; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + ref.action = BTRFS_DROP_DELAYED_REF; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + ref.action = BTRFS_ADD_DELAYED_REF; + ref.ref_root = FAKE_ROOT_OBJECTID + 2; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + head = NULL; + goto out; + } + + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("head check failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + node_check.action = BTRFS_ADD_DELAYED_REF; + node_check.root = FAKE_ROOT_OBJECTID + 2; + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + node_check.action = BTRFS_DROP_DELAYED_REF; + node_check.root = FAKE_ROOT_OBJECTID; + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + delete_delayed_ref_node(head, node); + ret = 0; +out: + if (head) + 
btrfs_unselect_ref_head(delayed_refs, head); + btrfs_destroy_delayed_refs(trans->transaction); + return ret; +} + +/* + * Entry point of the delayed refs self tests. Sets up a dummy fs_info and + * transaction, then runs the simple, merge (metadata and data) and select + * tests in that order, stopping at the first one that fails. + * + * Returns 0 on success or the negative error of the first failure. + */ +int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize) +{ + struct btrfs_transaction *transaction; + struct btrfs_trans_handle trans; + struct btrfs_fs_info *fs_info; + int ret; + + test_msg("running delayed refs tests"); + + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + return -ENOMEM; + } + transaction = kmalloc(sizeof(*transaction), GFP_KERNEL); + if (!transaction) { + test_std_err(TEST_ALLOC_TRANSACTION); + ret = -ENOMEM; + goto out_free_fs_info; + } + btrfs_init_dummy_trans(&trans, fs_info); + btrfs_init_dummy_transaction(transaction, fs_info); + trans.transaction = transaction; + + ret = simple_tests(&trans); + if (!ret) { + test_msg("running delayed refs merge tests on metadata refs"); + ret = merge_tests(&trans, BTRFS_REF_METADATA); + } + + if (!ret) { + test_msg("running delayed refs merge tests on data refs"); + ret = merge_tests(&trans, BTRFS_REF_DATA); + } + + if (!ret) + ret = select_delayed_refs_test(&trans); + + kfree(transaction); +out_free_fs_info: + btrfs_free_dummy_fs_info(fs_info); + return ret; +} diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c index 30f17eb7b6a8..a7bc58a5c1e2 100644 --- a/fs/btrfs/tests/raid-stripe-tree-tests.c +++ b/fs/btrfs/tests/raid-stripe-tree-tests.c @@ -14,6 +14,8 @@ #define RST_TEST_NUM_DEVICES (2) #define RST_TEST_RAID1_TYPE (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1) +#define SZ_48K (SZ_32K + SZ_16K) + typedef int (*test_func_t)(struct btrfs_trans_handle *trans); static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_devices, @@ -30,6 +32,613 @@ static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_de } /* + * Test creating a range of three extents and then punch a hole in the middle, + * deleting all of the middle extents and partially
deleting the "book ends". + */ +static int test_punch_hole_3extents(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical1 = SZ_1M; + u64 len1 = SZ_1M; + u64 logical2 = logical1 + len1; + u64 len2 = SZ_1M; + u64 logical3 = logical2 + len2; + u64 len3 = SZ_1M; + u64 hole_start = logical1 + SZ_256K; + u64 hole_len = SZ_2M; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + + /* Prepare for the test, 1st create 3 x 1M extents. */ + bioc->map_type = map_type; + bioc->size = len1; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical1 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical2; + bioc->size = len2; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical2 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical3; + bioc->size = len3; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = 
btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical3 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + /* + * Delete a range starting at logical1 + 256K and 2M in length. Extent + * 1 is truncated to 256k length, extent 2 is completely dropped and + * extent 3 is moved 256K to the right. + */ + ret = btrfs_delete_raid_extent(trans, hole_start, hole_len); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + hole_start, hole_start + hole_len); + goto out; + } + + /* Get the first extent and check its size. */ + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical1, logical1 + len1); + goto out; + } + + if (io_stripe.physical != logical1) { + test_err("invalid physical address, expected %llu, got %llu", + logical1, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len1 != SZ_256K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_256K, len1); + ret = -EINVAL; + goto out; + } + + /* Get the second extent and check it's absent. */ + ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type, + 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID extent [%llu, %llu] succeeded should fail", + logical2, logical2 + len2); + ret = -EINVAL; + goto out; + } + + /* Get the third extent and check its size. 
*/ + logical3 += SZ_256K; + ret = btrfs_get_raid_extent_offset(fs_info, logical3, &len3, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical3, logical3 + len3); + goto out; + } + + if (io_stripe.physical != logical3) { + /* logical3 was already advanced by 256K above, report it as is. */ + test_err("invalid physical address, expected %llu, got %llu", + logical3, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len3 != SZ_1M - SZ_256K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_1M - SZ_256K, len3); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical1, len1); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical1, logical1 + len1); + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical3, len3); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical3, logical3 + len3); + goto out; + } + +out: + btrfs_put_bioc(bioc); + return ret; +} + +static int test_delete_two_extents(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical1 = SZ_1M; + u64 len1 = SZ_1M; + u64 logical2 = logical1 + len1; + u64 len2 = SZ_1M; + u64 logical3 = logical2 + len2; + u64 len3 = SZ_1M; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + + /* Prepare for the test, 1st create 3 x 1M extents.
*/ + bioc->map_type = map_type; + bioc->size = len1; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical1 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical2; + bioc->size = len2; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical2 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical3; + bioc->size = len3; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical3 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + /* + * Delete a range starting at logical1 and 2M in length. Extents 1 + * and 2 are dropped and extent 3 is kept as is. 
+ */ + ret = btrfs_delete_raid_extent(trans, logical1, len1 + len2); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical1, logical1 + len1 + len2); + goto out; + } + + /* + * The deleted extents must be gone. A lookup that succeeds returns 0, + * so force an error to keep the test from passing by accident. + */ + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type, + 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", + logical1, logical1 + len1); + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type, + 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", + logical2, logical2 + len2); + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical3, &len3, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical3, logical3 + len3); + goto out; + } + + if (io_stripe.physical != logical3) { + test_err("invalid physical address, expected %llu, got %llu", + logical3, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len3 != SZ_1M) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_1M, len3); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical3, len3); +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* Test punching a hole into a single RAID stripe-extent.
*/ +static int test_punch_hole(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical1 = SZ_1M; + u64 hole_start = logical1 + SZ_32K; + u64 hole_len = SZ_64K; + u64 logical2 = hole_start + hole_len; + u64 len = SZ_1M; + u64 len1 = SZ_32K; + u64 len2 = len - len1 - hole_len; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical1 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len, map_type, 0, + &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical1, + logical1 + len); + goto out; + } + + if (io_stripe.physical != logical1) { + test_err("invalid physical address, expected %llu got %llu", + logical1, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_1M) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_1M, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, hole_start, hole_len); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + hole_start, hole_start + hole_len); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type, + 0, &io_stripe); + if (ret) { + 
test_err("lookup of RAID extent [%llu, %llu] failed", + logical1, logical1 + len1); + goto out; + } + + if (io_stripe.physical != logical1) { + test_err("invalid physical address, expected %llu, got %llu", + logical1, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len1 != SZ_32K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_32K, len1); + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical2, + logical2 + len2); + goto out; + } + + if (io_stripe.physical != logical2) { + test_err("invalid physical address, expected %llu, got %llu", + logical2, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len2 != len - len1 - hole_len) { + test_err("invalid length, expected %llu, got %llu", + len - len1 - hole_len, len2); + ret = -EINVAL; + goto out; + } + + /* Check for the absence of the hole. */ + ret = btrfs_get_raid_extent_offset(fs_info, hole_start, &hole_len, + map_type, 0, &io_stripe); + if (ret != -ENODATA) { + ret = -EINVAL; + test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", + hole_start, hole_start + SZ_64K); + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical1, len1); + if (ret) + goto out; + + ret = btrfs_delete_raid_extent(trans, logical2, len2); +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* + * Test a 1M RST write that spans two adjacent RST items on disk and then + * delete a portion starting in the first item and spanning into the second + * item. This is similar to test_front_delete(), but spanning multiple items. 
+ */ +static int test_front_delete_prev_item(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical1 = SZ_1M; + u64 logical2 = SZ_2M; + u64 len = SZ_1M; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + /* Insert RAID extent 1. */ + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical1 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical2; + /* Insert RAID extent 2, directly adjacent to it. */ + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical2 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical1 + SZ_512K, SZ_1M); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical1 + SZ_512K, (u64)SZ_1M); + goto out; + } + + /* Verify item 1 is truncated to 512K. 
*/ + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len, map_type, 0, + &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical1, + logical1 + len); + goto out; + } + + if (io_stripe.physical != logical1) { + test_err("invalid physical address, expected %llu got %llu", + logical1, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_512K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_512K, len); + ret = -EINVAL; + goto out; + } + + /* Verify item 2's start is moved by 512K. */ + ret = btrfs_get_raid_extent_offset(fs_info, logical2 + SZ_512K, &len, + map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical2 + SZ_512K, logical2 + SZ_512K + len); + goto out; + } + + if (io_stripe.physical != logical2 + SZ_512K) { + test_err("invalid physical address, expected %llu got %llu", + logical2 + SZ_512K, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_512K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_512K, len); + ret = -EINVAL; + goto out; + } + + /* + * Verify there's a hole at [1M+512K, 2M+512K]. A lookup that succeeds + * returns 0, so force an error to keep the test from passing. + */ + len = SZ_1M; + ret = btrfs_get_raid_extent_offset(fs_info, logical1 + SZ_512K, &len, + map_type, 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID [%llu, %llu] succeeded, should fail", + logical1 + SZ_512K, logical1 + SZ_512K + len); + ret = -EINVAL; + goto out; + } + + /* Clean up after us. */ + ret = btrfs_delete_raid_extent(trans, logical1, SZ_512K); + if (ret) + goto out; + + ret = btrfs_delete_raid_extent(trans, logical2 + SZ_512K, SZ_512K); + +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then * delete the 1st 32K, making the new start address 1M+32K.
*/ @@ -94,45 +703,45 @@ static int test_front_delete(struct btrfs_trans_handle *trans) goto out; } - ret = btrfs_delete_raid_extent(trans, logical, SZ_32K); + ret = btrfs_delete_raid_extent(trans, logical, SZ_16K); if (ret) { test_err("deleting RAID extent [%llu, %llu] failed", logical, - logical + SZ_32K); + logical + SZ_16K); goto out; } - len = SZ_32K; - ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_32K, &len, + len -= SZ_16K; + ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_16K, &len, map_type, 0, &io_stripe); if (ret) { test_err("lookup of RAID extent [%llu, %llu] failed", - logical + SZ_32K, logical + SZ_32K + len); + logical + SZ_16K, logical + SZ_64K); goto out; } - if (io_stripe.physical != logical + SZ_32K) { + if (io_stripe.physical != logical + SZ_16K) { test_err("invalid physical address, expected %llu, got %llu", - logical + SZ_32K, io_stripe.physical); + logical + SZ_16K, io_stripe.physical); ret = -EINVAL; goto out; } - if (len != SZ_32K) { + if (len != SZ_48K) { test_err("invalid stripe length, expected %llu, got %llu", - (u64)SZ_32K, len); + (u64)SZ_48K, len); ret = -EINVAL; goto out; } ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); - if (!ret) { + if (ret != -ENODATA) { ret = -EINVAL; test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", - logical, logical + SZ_32K); + logical, logical + SZ_16K); goto out; } - ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K); + ret = btrfs_delete_raid_extent(trans, logical + SZ_16K, SZ_48K); out: btrfs_put_bioc(bioc); return ret; @@ -209,14 +818,14 @@ static int test_tail_delete(struct btrfs_trans_handle *trans) goto out; } - ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K); + ret = btrfs_delete_raid_extent(trans, logical + SZ_48K, SZ_16K); if (ret) { test_err("deleting RAID extent [%llu, %llu] failed", - logical + SZ_32K, logical + SZ_64K); + logical + SZ_48K, logical + SZ_64K); goto out; } - len = SZ_32K; + 
len = SZ_48K; ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); if (ret) { test_err("lookup of RAID extent [%llu, %llu] failed", logical, @@ -231,9 +840,19 @@ static int test_tail_delete(struct btrfs_trans_handle *trans) goto out; } - if (len != SZ_32K) { + if (len != SZ_48K) { test_err("invalid stripe length, expected %llu, got %llu", - (u64)SZ_32K, len); + (u64)SZ_48K, len); + ret = -EINVAL; + goto out; + } + + len = SZ_16K; + ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_48K, &len, + map_type, 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID extent [%llu, %llu] succeeded should fail", + logical + SZ_48K, logical + SZ_64K); ret = -EINVAL; goto out; } @@ -456,6 +1075,10 @@ static const test_func_t tests[] = { test_create_update_delete, test_tail_delete, test_front_delete, + test_front_delete_prev_item, + test_punch_hole, + test_punch_hole_3extents, + test_delete_two_extents, }; static int run_test(test_func_t test, u32 sectorsize, u32 nodesize) @@ -478,8 +1101,8 @@ static int run_test(test_func_t test, u32 sectorsize, u32 nodesize) ret = PTR_ERR(root); goto out; } - btrfs_set_super_compat_ro_flags(root->fs_info->super_copy, - BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE); + btrfs_set_super_incompat_flags(root->fs_info->super_copy, + BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE); root->root_key.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID; root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = 0; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index dc0b837efd5d..aca83a98b75a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -274,8 +274,10 @@ loop: cur_trans = fs_info->running_transaction; if (cur_trans) { if (TRANS_ABORTED(cur_trans)) { + const int abort_error = cur_trans->aborted; + spin_unlock(&fs_info->trans_lock); - return cur_trans->aborted; + return abort_error; } if (btrfs_blocked_trans_types[cur_trans->state] & type) { spin_unlock(&fs_info->trans_lock); @@ 
-795,8 +797,7 @@ alloc_fail: if (num_bytes) btrfs_block_rsv_release(fs_info, trans_rsv, num_bytes, NULL); if (delayed_refs_bytes) - btrfs_space_info_free_bytes_may_use(fs_info, trans_rsv->space_info, - delayed_refs_bytes); + btrfs_space_info_free_bytes_may_use(trans_rsv->space_info, delayed_refs_bytes); reserve_fail: btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); return ERR_PTR(ret); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 184fa5c0062a..9f7c777af635 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -227,7 +227,21 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) delayed_refs->qgroup_to_skip = 0; } -bool __cold abort_should_print_stack(int error); +/* + * We want the transaction abort to print stack trace only for errors where the + * cause could be a bug, eg. due to ENOSPC, and not for common errors that are + * caused by external factors. + */ +static inline bool btrfs_abort_should_print_stack(int error) +{ + switch (error) { + case -EIO: + case -EROFS: + case -ENOMEM: + return false; + } + return true; +} /* * Call btrfs_abort_transaction as early as possible when an error condition is @@ -240,7 +254,7 @@ do { \ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ &((trans)->fs_info->fs_state))) { \ __first = true; \ - if (WARN(abort_should_print_stack(error), \ + if (WARN(btrfs_abort_should_print_stack(error), \ KERN_ERR \ "BTRFS: Transaction aborted (error %d)\n", \ (error))) { \ diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index dfeee033f31f..43979891f7c8 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -764,22 +764,19 @@ static int check_block_group_item(struct extent_buffer *leaf, return 0; } -__printf(4, 5) +__printf(5, 6) __cold -static void chunk_err(const struct extent_buffer *leaf, +static void chunk_err(const struct btrfs_fs_info *fs_info, + const struct extent_buffer *leaf, const struct btrfs_chunk *chunk, u64 logical, const char 
*fmt, ...) { - const struct btrfs_fs_info *fs_info = leaf->fs_info; - bool is_sb; + bool is_sb = !leaf; struct va_format vaf; va_list args; int i; int slot = -1; - /* Only superblock eb is able to have such small offset */ - is_sb = (leaf->start == BTRFS_SUPER_INFO_OFFSET); - if (!is_sb) { /* * Get the slot number by iterating through all slots, this @@ -812,13 +809,17 @@ static void chunk_err(const struct extent_buffer *leaf, /* * The common chunk check which could also work on super block sys chunk array. * + * If @leaf is NULL, then @chunk must be an on-stack chunk item. + * (For superblock sys_chunk array, and fs_info->sectorsize is unreliable) + * * Return -EUCLEAN if anything is corrupted. * Return 0 if everything is OK. */ -int btrfs_check_chunk_valid(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, u64 logical) +int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info, + const struct extent_buffer *leaf, + const struct btrfs_chunk *chunk, u64 logical, + u32 sectorsize) { - struct btrfs_fs_info *fs_info = leaf->fs_info; u64 length; u64 chunk_end; u64 stripe_len; @@ -826,63 +827,73 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, u16 sub_stripes; u64 type; u64 features; + u32 chunk_sector_size; bool mixed = false; int raid_index; int nparity; int ncopies; - length = btrfs_chunk_length(leaf, chunk); - stripe_len = btrfs_chunk_stripe_len(leaf, chunk); - num_stripes = btrfs_chunk_num_stripes(leaf, chunk); - sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); - type = btrfs_chunk_type(leaf, chunk); + if (leaf) { + length = btrfs_chunk_length(leaf, chunk); + stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + type = btrfs_chunk_type(leaf, chunk); + chunk_sector_size = btrfs_chunk_sector_size(leaf, chunk); + } else { + length = btrfs_stack_chunk_length(chunk); + stripe_len = btrfs_stack_chunk_stripe_len(chunk); + num_stripes = 
btrfs_stack_chunk_num_stripes(chunk); + sub_stripes = btrfs_stack_chunk_sub_stripes(chunk); + type = btrfs_stack_chunk_type(chunk); + chunk_sector_size = btrfs_stack_chunk_sector_size(chunk); + } raid_index = btrfs_bg_flags_to_raid_index(type); ncopies = btrfs_raid_array[raid_index].ncopies; nparity = btrfs_raid_array[raid_index].nparity; if (unlikely(!num_stripes)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk num_stripes, have %u", num_stripes); return -EUCLEAN; } if (unlikely(num_stripes < ncopies)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk num_stripes < ncopies, have %u < %d", num_stripes, ncopies); return -EUCLEAN; } if (unlikely(nparity && num_stripes == nparity)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk num_stripes == nparity, have %u == %d", num_stripes, nparity); return -EUCLEAN; } - if (unlikely(!IS_ALIGNED(logical, fs_info->sectorsize))) { - chunk_err(leaf, chunk, logical, + if (unlikely(!IS_ALIGNED(logical, sectorsize))) { + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk logical, have %llu should aligned to %u", - logical, fs_info->sectorsize); + logical, sectorsize); return -EUCLEAN; } - if (unlikely(btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize)) { - chunk_err(leaf, chunk, logical, + if (unlikely(chunk_sector_size != sectorsize)) { + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk sectorsize, have %u expect %u", - btrfs_chunk_sector_size(leaf, chunk), - fs_info->sectorsize); + chunk_sector_size, sectorsize); return -EUCLEAN; } - if (unlikely(!length || !IS_ALIGNED(length, fs_info->sectorsize))) { - chunk_err(leaf, chunk, logical, + if (unlikely(!length || !IS_ALIGNED(length, sectorsize))) { + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk length, have %llu", length); return -EUCLEAN; } if (unlikely(check_add_overflow(logical, length, &chunk_end))) { - 
chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk logical start and length, have logical start %llu length %llu", logical, length); return -EUCLEAN; } if (unlikely(!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk stripe length: %llu", stripe_len); return -EUCLEAN; @@ -896,30 +907,29 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, * Thus it should be a good way to catch obvious bitflips. */ if (unlikely(length >= btrfs_stripe_nr_to_offset(U32_MAX))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "chunk length too large: have %llu limit %llu", length, btrfs_stripe_nr_to_offset(U32_MAX)); return -EUCLEAN; } if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "unrecognized chunk type: 0x%llx", ~(BTRFS_BLOCK_GROUP_TYPE_MASK | - BTRFS_BLOCK_GROUP_PROFILE_MASK) & - btrfs_chunk_type(leaf, chunk)); + BTRFS_BLOCK_GROUP_PROFILE_MASK) & type); return -EUCLEAN; } if (unlikely(!has_single_bit_set(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) && (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set", type & BTRFS_BLOCK_GROUP_PROFILE_MASK); return -EUCLEAN; } if (unlikely((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "missing chunk type flag, have 0x%llx one bit must be set in 0x%llx", type, BTRFS_BLOCK_GROUP_TYPE_MASK); return -EUCLEAN; @@ -928,7 +938,7 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, if (unlikely((type & BTRFS_BLOCK_GROUP_SYSTEM) && (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA)))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, 
leaf, chunk, logical, "system chunk with data or metadata type: 0x%llx", type); return -EUCLEAN; @@ -941,7 +951,7 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, if (!mixed) { if (unlikely((type & BTRFS_BLOCK_GROUP_METADATA) && (type & BTRFS_BLOCK_GROUP_DATA))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "mixed chunk type in non-mixed mode: 0x%llx", type); return -EUCLEAN; } @@ -963,7 +973,7 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, num_stripes != btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes) || ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && num_stripes != btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid num_stripes:sub_stripes %u:%u for profile %llu", num_stripes, sub_stripes, type & BTRFS_BLOCK_GROUP_PROFILE_MASK); @@ -983,14 +993,15 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf, struct btrfs_chunk *chunk, struct btrfs_key *key, int slot) { + struct btrfs_fs_info *fs_info = leaf->fs_info; int num_stripes; if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) { - chunk_err(leaf, chunk, key->offset, + chunk_err(fs_info, leaf, chunk, key->offset, "invalid chunk item size: have %u expect [%zu, %u)", btrfs_item_size(leaf, slot), sizeof(struct btrfs_chunk), - BTRFS_LEAF_DATA_SIZE(leaf->fs_info)); + BTRFS_LEAF_DATA_SIZE(fs_info)); return -EUCLEAN; } @@ -1001,14 +1012,15 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf, if (unlikely(btrfs_chunk_item_size(num_stripes) != btrfs_item_size(leaf, slot))) { - chunk_err(leaf, chunk, key->offset, + chunk_err(fs_info, leaf, chunk, key->offset, "invalid chunk item size: have %u expect %lu", btrfs_item_size(leaf, slot), btrfs_chunk_item_size(num_stripes)); return -EUCLEAN; } out: - return btrfs_check_chunk_valid(leaf, chunk, key->offset); + return btrfs_check_chunk_valid(fs_info, leaf, chunk, key->offset, + fs_info->sectorsize); } 
__printf(3, 4) diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index db67f96cbe4b..eb201f4ec3c7 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -10,6 +10,7 @@ #include <uapi/linux/btrfs_tree.h> struct extent_buffer; +struct btrfs_fs_info; struct btrfs_chunk; struct btrfs_key; @@ -66,8 +67,10 @@ enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node); int btrfs_check_leaf(struct extent_buffer *leaf); int btrfs_check_node(struct extent_buffer *node); -int btrfs_check_chunk_valid(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, u64 logical); +int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info, + const struct extent_buffer *leaf, + const struct btrfs_chunk *chunk, u64 logical, + u32 sectorsize); int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner); int btrfs_verify_level_key(struct extent_buffer *eb, const struct btrfs_tree_parent_check *check); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c8d6587688b3..955d1677e865 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -590,7 +590,6 @@ insert: } } no_copy: - btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); return 0; } @@ -3588,7 +3587,6 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, last_offset = max(last_offset, curr_end); } btrfs_set_dir_log_end(path->nodes[0], item, last_offset); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); return 0; } @@ -4566,7 +4564,6 @@ copy_item: dst_index++; } - btrfs_mark_buffer_dirty(trans, dst_path->nodes[0]); btrfs_release_path(dst_path); out: kfree(ins_data); @@ -4776,7 +4773,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, &fi, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(fi)); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index aca2861f2187..17b5e81123a1 
100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -140,8 +140,6 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ ret = 0; subid_le = cpu_to_le64(subid_cpu); write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le)); - btrfs_mark_buffer_dirty(trans, eb); - out: btrfs_free_path(path); return ret; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1cccaf9c2b0d..3f8afbd1ebb5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -13,8 +13,8 @@ #include <linux/list_sort.h> #include <linux/namei.h> #include "misc.h" -#include "ctree.h" #include "disk-io.h" +#include "extent-tree.h" #include "transaction.h" #include "volumes.h" #include "raid56.h" @@ -48,6 +48,7 @@ struct btrfs_io_geometry { u64 raid56_full_stripe_start; int max_errors; enum btrfs_map_op op; + bool use_rst; }; const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { @@ -797,6 +798,10 @@ static int get_canonical_dev_path(const char *dev_path, char *canonical) if (ret) goto out; resolved_path = d_path(&path, path_buf, PATH_MAX); + if (IS_ERR(resolved_path)) { + ret = PTR_ERR(resolved_path); + goto out; + } ret = strscpy(canonical, resolved_path, PATH_MAX); out: kfree(path_buf); @@ -1298,6 +1303,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; struct btrfs_device *tmp_device; + s64 __maybe_unused value = 0; int ret = 0; list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, @@ -1327,7 +1333,23 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, fs_devices->latest_dev = latest_dev; fs_devices->total_rw_bytes = 0; fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; + fs_devices->read_devid = latest_dev->devid; + fs_devices->read_policy = btrfs_read_policy_to_enum(btrfs_get_mod_read_policy(), + &value); + if 
(fs_devices->read_policy == BTRFS_READ_POLICY_RR) + fs_devices->collect_fs_stats = true; + + if (value) { + if (fs_devices->read_policy == BTRFS_READ_POLICY_RR) + fs_devices->rr_min_contig_read = value; + if (fs_devices->read_policy == BTRFS_READ_POLICY_DEVID) + fs_devices->read_devid = value; + } +#else fs_devices->read_policy = BTRFS_READ_POLICY_PID; +#endif return 0; } @@ -2045,7 +2067,6 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, ptr = btrfs_device_fsid(dev_item); write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, ptr, BTRFS_FSID_SIZE); - btrfs_mark_buffer_dirty(trans, leaf); ret = 0; out: @@ -2741,11 +2762,9 @@ next_slot: device = btrfs_find_device(fs_info->fs_devices, &args); BUG_ON(!device); /* Logic error */ - if (device->fs_devices->seeding) { + if (device->fs_devices->seeding) btrfs_set_device_generation(leaf, dev_item, device->generation); - btrfs_mark_buffer_dirty(trans, leaf); - } path->slots[0]++; goto next_slot; @@ -3038,8 +3057,6 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, btrfs_device_get_disk_total_bytes(device)); btrfs_set_device_bytes_used(leaf, dev_item, btrfs_device_get_bytes_used(device)); - btrfs_mark_buffer_dirty(trans, leaf); - out: btrfs_free_path(path); return ret; @@ -3748,10 +3765,7 @@ static int insert_balance_item(struct btrfs_fs_info *fs_info, btrfs_set_balance_meta(leaf, item, &disk_bargs); btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); btrfs_set_balance_sys(leaf, item, &disk_bargs); - btrfs_set_balance_flags(leaf, item, bctl->flags); - - btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); err = btrfs_commit_transaction(trans); @@ -5513,33 +5527,34 @@ void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_ma btrfs_free_chunk_map(map); } +static int btrfs_chunk_map_cmp(const struct rb_node *new, + const struct rb_node *exist) +{ + const struct btrfs_chunk_map *new_map = + rb_entry(new, struct btrfs_chunk_map, 
rb_node); + const struct btrfs_chunk_map *exist_map = + rb_entry(exist, struct btrfs_chunk_map, rb_node); + + if (new_map->start == exist_map->start) + return 0; + if (new_map->start < exist_map->start) + return -1; + return 1; +} + EXPORT_FOR_TESTS int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map) { - struct rb_node **p; - struct rb_node *parent = NULL; - bool leftmost = true; + struct rb_node *exist; write_lock(&fs_info->mapping_tree_lock); - p = &fs_info->mapping_tree.rb_root.rb_node; - while (*p) { - struct btrfs_chunk_map *entry; - - parent = *p; - entry = rb_entry(parent, struct btrfs_chunk_map, rb_node); - - if (map->start < entry->start) { - p = &(*p)->rb_left; - } else if (map->start > entry->start) { - p = &(*p)->rb_right; - leftmost = false; - } else { - write_unlock(&fs_info->mapping_tree_lock); - return -EEXIST; - } + exist = rb_find_add_cached(&map->rb_node, &fs_info->mapping_tree, + btrfs_chunk_map_cmp); + + if (exist) { + write_unlock(&fs_info->mapping_tree_lock); + return -EEXIST; } - rb_link_node(&map->rb_node, parent, p); - rb_insert_color_cached(&map->rb_node, &fs_info->mapping_tree, leftmost); chunk_map_device_set_bits(map, CHUNK_ALLOCATED); chunk_map_device_clear_bits(map, CHUNK_TRIMMED); write_unlock(&fs_info->mapping_tree_lock); @@ -5959,6 +5974,76 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, return len; } +#ifdef CONFIG_BTRFS_EXPERIMENTAL +static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first, int num_stripes) +{ + for (int index = first; index < first + num_stripes; index++) { + const struct btrfs_device *device = map->stripes[index].dev; + + if (device->devid == READ_ONCE(device->fs_devices->read_devid)) + return index; + } + + /* If no read-preferred device is set use the first stripe. 
*/ + return first; +} + +struct stripe_mirror { + u64 devid; + int num; +}; + +static int btrfs_cmp_devid(const void *a, const void *b) +{ + const struct stripe_mirror *s1 = (const struct stripe_mirror *)a; + const struct stripe_mirror *s2 = (const struct stripe_mirror *)b; + + if (s1->devid < s2->devid) + return -1; + if (s1->devid > s2->devid) + return 1; + return 0; +} + +/* + * Select a stripe for reading using the round-robin algorithm. + * + * 1. Compute the read cycle as the total sectors read divided by the minimum + * sectors per device. + * 2. Determine the stripe number for the current read by taking the modulus + * of the read cycle with the total number of stripes: + * + * stripe index = (total sectors / min sectors per dev) % num stripes + * + * The calculated stripe index is then used to select the corresponding device + * from the list of devices, which is ordered by devid. + */ +static int btrfs_read_rr(const struct btrfs_chunk_map *map, int first, int num_stripes) +{ + struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = { 0 }; + struct btrfs_device *device = map->stripes[first].dev; + struct btrfs_fs_info *fs_info = device->fs_devices->fs_info; + unsigned int read_cycle; + unsigned int total_reads; + unsigned int min_reads_per_dev; + + total_reads = percpu_counter_sum(&fs_info->stats_read_blocks); + min_reads_per_dev = READ_ONCE(fs_info->fs_devices->rr_min_contig_read) >> + fs_info->sectorsize_bits; + + for (int index = 0, i = first; i < first + num_stripes; i++) { + stripes[index].devid = map->stripes[i].dev->devid; + stripes[index].num = i; + index++; + } + sort(stripes, num_stripes, sizeof(struct stripe_mirror), + btrfs_cmp_devid, NULL); + + read_cycle = total_reads / min_reads_per_dev; + return stripes[read_cycle % num_stripes].num; +} +#endif + static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map, int first, int dev_replace_is_ongoing) @@ -5988,6 +6073,14 @@ static int find_live_mirror(struct 
btrfs_fs_info *fs_info, case BTRFS_READ_POLICY_PID: preferred_mirror = first + (current->pid % num_stripes); break; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + case BTRFS_READ_POLICY_RR: + preferred_mirror = btrfs_read_rr(map, first, num_stripes); + break; + case BTRFS_READ_POLICY_DEVID: + preferred_mirror = btrfs_read_preferred(map, first, num_stripes); + break; +#endif } if (dev_replace_is_ongoing && @@ -6346,8 +6439,7 @@ static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical, { dst->dev = map->stripes[io_geom->stripe_index].dev; - if (io_geom->op == BTRFS_MAP_READ && - btrfs_need_stripe_tree_update(fs_info, map->type)) + if (io_geom->op == BTRFS_MAP_READ && io_geom->use_rst) return btrfs_get_raid_extent_offset(fs_info, logical, length, map->type, io_geom->stripe_index, dst); @@ -6362,7 +6454,7 @@ static bool is_single_device_io(struct btrfs_fs_info *fs_info, const struct btrfs_io_stripe *smap, const struct btrfs_chunk_map *map, int num_alloc_stripes, - enum btrfs_map_op op, int mirror_num) + struct btrfs_io_geometry *io_geom) { if (!smap) return false; @@ -6370,10 +6462,10 @@ static bool is_single_device_io(struct btrfs_fs_info *fs_info, if (num_alloc_stripes != 1) return false; - if (btrfs_need_stripe_tree_update(fs_info, map->type) && op != BTRFS_MAP_READ) + if (io_geom->use_rst && io_geom->op != BTRFS_MAP_READ) return false; - if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) + if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && io_geom->mirror_num > 1) return false; return true; @@ -6579,6 +6671,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, io_geom.raid56_full_stripe_start = (u64)-1; max_len = btrfs_max_io_len(map, map_offset, &io_geom); *length = min_t(u64, map->chunk_len - map_offset, max_len); + io_geom.use_rst = btrfs_need_stripe_tree_update(fs_info, map->type); if (dev_replace->replace_task != current) down_read(&dev_replace->rwsem); @@ -6647,8 +6740,7 @@ int btrfs_map_block(struct btrfs_fs_info 
*fs_info, enum btrfs_map_op op, * physical block information on the stack instead of allocating an * I/O context structure. */ - if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, op, - io_geom.mirror_num)) { + if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, &io_geom)) { ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom); if (mirror_num_ret) *mirror_num_ret = io_geom.mirror_num; @@ -6662,6 +6754,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, goto out; } bioc->map_type = map->type; + bioc->use_rst = io_geom.use_rst; /* * For RAID56 full map, we need to make sure the stripes[] follows the @@ -7002,16 +7095,6 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, warn_32bit_meta_chunk(fs_info, logical, length, type); #endif - /* - * Only need to verify chunk item if we're reading from sys chunk array, - * as chunk item in tree block is already verified by tree-checker. - */ - if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { - ret = btrfs_check_chunk_valid(leaf, chunk, logical); - if (ret) - return ret; - } - map = btrfs_find_chunk_map(fs_info, logical, 1); /* already mapped? 
*/ @@ -7072,6 +7155,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, btrfs_err(fs_info, "failed to add chunk map, start=%llu len=%llu: %d", map->start, map->chunk_len, ret); + btrfs_free_chunk_map(map); } return ret; @@ -7117,8 +7201,12 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, fs_devices = find_fsid(fsid, NULL); if (!fs_devices) { - if (!btrfs_test_opt(fs_info, DEGRADED)) + if (!btrfs_test_opt(fs_info, DEGRADED)) { + btrfs_err(fs_info, + "failed to find fsid %pU when attempting to open seed devices", + fsid); return ERR_PTR(-ENOENT); + } fs_devices = alloc_fs_devices(fsid); if (IS_ERR(fs_devices)) @@ -7269,16 +7357,11 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) { struct btrfs_super_block *super_copy = fs_info->super_copy; struct extent_buffer *sb; - struct btrfs_disk_key *disk_key; - struct btrfs_chunk *chunk; u8 *array_ptr; unsigned long sb_array_offset; int ret = 0; - u32 num_stripes; u32 array_size; - u32 len = 0; u32 cur_offset; - u64 type; struct btrfs_key key; ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); @@ -7301,10 +7384,15 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) cur_offset = 0; while (cur_offset < array_size) { - disk_key = (struct btrfs_disk_key *)array_ptr; - len = sizeof(*disk_key); - if (cur_offset + len > array_size) - goto out_short_read; + struct btrfs_chunk *chunk; + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)array_ptr; + u32 len = sizeof(*disk_key); + + /* + * The sys_chunk_array has been already verified at super block + * read time. Only do ASSERT()s for basic checks. 
+ */ + ASSERT(cur_offset + len <= array_size); btrfs_disk_key_to_cpu(&key, disk_key); @@ -7312,44 +7400,14 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) sb_array_offset += len; cur_offset += len; - if (key.type != BTRFS_CHUNK_ITEM_KEY) { - btrfs_err(fs_info, - "unexpected item type %u in sys_array at offset %u", - (u32)key.type, cur_offset); - ret = -EIO; - break; - } + ASSERT(key.type == BTRFS_CHUNK_ITEM_KEY); chunk = (struct btrfs_chunk *)sb_array_offset; - /* - * At least one btrfs_chunk with one stripe must be present, - * exact stripe count check comes afterwards - */ - len = btrfs_chunk_item_size(1); - if (cur_offset + len > array_size) - goto out_short_read; - - num_stripes = btrfs_chunk_num_stripes(sb, chunk); - if (!num_stripes) { - btrfs_err(fs_info, - "invalid number of stripes %u in sys_array at offset %u", - num_stripes, cur_offset); - ret = -EIO; - break; - } + ASSERT(btrfs_chunk_type(sb, chunk) & BTRFS_BLOCK_GROUP_SYSTEM); - type = btrfs_chunk_type(sb, chunk); - if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { - btrfs_err(fs_info, - "invalid chunk type %llu in sys_array at offset %u", - type, cur_offset); - ret = -EIO; - break; - } + len = btrfs_chunk_item_size(btrfs_chunk_num_stripes(sb, chunk)); - len = btrfs_chunk_item_size(num_stripes); - if (cur_offset + len > array_size) - goto out_short_read; + ASSERT(cur_offset + len <= array_size); ret = read_one_chunk(&key, sb, chunk); if (ret) @@ -7362,13 +7420,6 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) clear_extent_buffer_uptodate(sb); free_extent_buffer_stale(sb); return ret; - -out_short_read: - btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", - len, cur_offset); - clear_extent_buffer_uptodate(sb); - free_extent_buffer_stale(sb); - return -EIO; } /* @@ -7568,8 +7619,6 @@ int btrfs_init_devices_late(struct btrfs_fs_info *fs_info) struct btrfs_device *device; int ret = 0; - fs_devices->fs_info = fs_info; - mutex_lock(&fs_devices->device_list_mutex); 
list_for_each_entry(device, &fs_devices->devices, dev_list) device->fs_info = fs_info; @@ -7745,8 +7794,6 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) btrfs_set_dev_stats_value(eb, ptr, i, btrfs_dev_stat_read(device, i)); - btrfs_mark_buffer_dirty(trans, eb); - out: btrfs_free_path(path); return ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 3a416b1bc24c..120f65e21eeb 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -296,6 +296,9 @@ enum btrfs_chunk_allocation_policy { BTRFS_CHUNK_ALLOC_ZONED, }; +#define BTRFS_DEFAULT_RR_MIN_CONTIG_READ (SZ_256K) +/* Keep in sync with raid_attr table, current maximum is RAID1C4. */ +#define BTRFS_RAID1_MAX_MIRRORS (4) /* * Read policies for mirrored block group profiles, read picks the stripe based * on these policies. @@ -303,6 +306,12 @@ enum btrfs_chunk_allocation_policy { enum btrfs_read_policy { /* Use process PID to choose the stripe */ BTRFS_READ_POLICY_PID, +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* Balancing RAID1 reads across all striped devices (round-robin). */ + BTRFS_READ_POLICY_RR, + /* Read from a specific device. */ + BTRFS_READ_POLICY_DEVID, +#endif BTRFS_NR_READ_POLICY, }; @@ -417,6 +426,8 @@ struct btrfs_fs_devices { bool seeding; /* The mount needs to use a randomly generated fsid. */ bool temp_fsid; + /* Enable/disable the filesystem stats tracking. */ + bool collect_fs_stats; struct btrfs_fs_info *fs_info; /* sysfs kobjects */ @@ -431,6 +442,15 @@ struct btrfs_fs_devices { enum btrfs_read_policy read_policy; #ifdef CONFIG_BTRFS_EXPERIMENTAL + /* + * Minimum contiguous reads before switching to next device, the unit + * is one block/sectorsize. + */ + u32 rr_min_contig_read; + + /* Device to be used for reading in case of RAID1. */ + u64 read_devid; + /* Checksum mode - offload it or do it synchronously. 
*/ enum btrfs_offload_csum_mode offload_csum_mode; #endif @@ -485,6 +505,7 @@ struct btrfs_io_context { struct bio *orig_bio; atomic_t error; u16 max_errors; + bool use_rst; u64 logical; u64 size; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index bc18710d1dcf..3e0edbcf73e1 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -204,7 +204,6 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, btrfs_set_dir_data_len(leaf, di, size); data_ptr = ((unsigned long)(di + 1)) + name_len; write_extent_buffer(leaf, value, data_ptr, size); - btrfs_mark_buffer_dirty(trans, leaf); } else { /* * Insert, and we had space for the xattr, so path->slots[0] is diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index ddf0d5a448a7..c9e92c6941ec 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -174,10 +174,10 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, copy_page(workspace->buf + i * PAGE_SIZE, data_in); start += PAGE_SIZE; - workspace->strm.avail_in = - (in_buf_folios << PAGE_SHIFT); } workspace->strm.next_in = workspace->buf; + workspace->strm.avail_in = min(bytes_left, + in_buf_folios << PAGE_SHIFT); } else { unsigned int pg_off; unsigned int cur_len; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 11ed523e528e..73e0aa9fc08a 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -748,8 +748,9 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) (u64)lim->max_segments << PAGE_SHIFT), fs_info->sectorsize); fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; - if (fs_info->max_zone_append_size < fs_info->max_extent_size) - fs_info->max_extent_size = fs_info->max_zone_append_size; + + fs_info->max_extent_size = min_not_zero(fs_info->max_extent_size, + fs_info->max_zone_append_size); /* * Check mount options here, because we might change fs_info->zoned @@ -2651,3 +2652,127 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->zone_active_bgs_lock); 
} + +/* + * Reset the zones of unused block groups from @space_info->bytes_zone_unusable. + * + * @space_info: the space to work on + * @num_bytes: targeting reclaim bytes + * + * This one resets the zones of a block group, so we can reuse the region + * without removing the block group. On the other hand, btrfs_delete_unused_bgs() + * just removes a block group and frees up the underlying zones. So, we still + * need to allocate a new block group to reuse the zones. + * + * Resetting is faster than deleting/recreating a block group. It is similar + * to freeing the logical space on the regular mode. However, we cannot change + * the block group's profile with this operation. + */ +int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes) +{ + struct btrfs_fs_info *fs_info = space_info->fs_info; + const sector_t zone_size_sectors = fs_info->zone_size >> SECTOR_SHIFT; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + while (num_bytes > 0) { + struct btrfs_chunk_map *map; + struct btrfs_block_group *bg = NULL; + bool found = false; + u64 reclaimed = 0; + + /* + * Here, we choose a fully zone_unusable block group. It's + * technically possible to reset a partly zone_unusable block + * group, which still has some free space left. However, + * handling that needs to cope with the allocation side, which + * makes the logic more complex. So, let's handle the easy case + * for now. + */ + spin_lock(&fs_info->unused_bgs_lock); + list_for_each_entry(bg, &fs_info->unused_bgs, bg_list) { + if ((bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != space_info->flags) + continue; + + /* + * Use trylock to avoid locking order violation. In + * btrfs_reclaim_bgs_work(), the lock order is + * &bg->lock -> &fs_info->unused_bgs_lock. We skip a + * block group if we cannot take its lock. 
+ */ + if (!spin_trylock(&bg->lock)) + continue; + if (btrfs_is_block_group_used(bg) || bg->zone_unusable < bg->length) { + spin_unlock(&bg->lock); + continue; + } + spin_unlock(&bg->lock); + found = true; + break; + } + if (!found) { + spin_unlock(&fs_info->unused_bgs_lock); + return 0; + } + + list_del_init(&bg->bg_list); + btrfs_put_block_group(bg); + spin_unlock(&fs_info->unused_bgs_lock); + + /* + * Since the block group is fully zone_unusable and we cannot + * allocate from this block group anymore, we don't need to set + * this block group read-only. + */ + + down_read(&fs_info->dev_replace.rwsem); + map = bg->physical_map; + for (int i = 0; i < map->num_stripes; i++) { + struct btrfs_io_stripe *stripe = &map->stripes[i]; + unsigned int nofs_flags; + int ret; + + nofs_flags = memalloc_nofs_save(); + ret = blkdev_zone_mgmt(stripe->dev->bdev, REQ_OP_ZONE_RESET, + stripe->physical >> SECTOR_SHIFT, + zone_size_sectors); + memalloc_nofs_restore(nofs_flags); + + if (ret) { + up_read(&fs_info->dev_replace.rwsem); + return ret; + } + } + up_read(&fs_info->dev_replace.rwsem); + + spin_lock(&space_info->lock); + spin_lock(&bg->lock); + ASSERT(!btrfs_is_block_group_used(bg)); + if (bg->ro) { + spin_unlock(&bg->lock); + spin_unlock(&space_info->lock); + continue; + } + + reclaimed = bg->alloc_offset; + bg->zone_unusable = bg->length - bg->zone_capacity; + bg->alloc_offset = 0; + /* + * This holds because we currently reset fully used then freed + * block group. 
+ */ + ASSERT(reclaimed == bg->zone_capacity); + bg->free_space_ctl->free_space += reclaimed; + space_info->bytes_zone_unusable -= reclaimed; + spin_unlock(&bg->lock); + btrfs_return_free_space(space_info, reclaimed); + spin_unlock(&space_info->lock); + + if (num_bytes <= reclaimed) + break; + num_bytes -= reclaimed; + } + + return 0; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 7612e6572605..9672bf4c3335 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -96,6 +96,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info); int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, bool do_finish); void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info); +int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) @@ -265,6 +266,12 @@ static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, static inline void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) { } +static inline int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, + u64 num_bytes) +{ + return 0; +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) |