author    | Linus Torvalds <torvalds@linux-foundation.org> | 2024-07-17 12:38:04 -0700
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-07-17 12:38:04 -0700
commit    | a1b547f0f217cfb06af7eb4ce8488b02d83a0370 (patch)
tree      | 2935586a4371bfb9c5abec688dc279f99504cd08 /fs/btrfs/inode.c
parent    | 6706415bf9f3dcb425f4b60a08a3a6f1d94ec0e0 (diff)
parent    | 8e7860543a94784d744c7ce34b78a2e11beefa5c (diff)
Merge tag 'for-6.11-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba:
"The highlights are new logic behind background block group reclaim,
automatic removal of qgroup after removing a subvolume and new
'rescue=' mount options.
The rest is optimizations, cleanups and refactoring.
User visible features:
- dynamic block group reclaim:
- tunable framework to avoid situations where eager data
allocations prevent creating new metadata chunks due to lack of
unallocated space
- reuse the sysfs knob bg_reclaim_threshold (otherwise used only in
zoned mode) as a fixed-value threshold
- new on/off sysfs knob "dynamic_reclaim" that calculates the value
based on heuristics, aiming to keep spare working space for
relocating chunks but not to needlessly relocate partially
utilized block groups or reclaim newly allocated ones
- stats are exported in sysfs per block group type, in files named
"reclaim_*" (see the sysfs sketch after the commit message)
- this may increase IO load at unexpected times, but the corner
case of having no allocatable block groups is known to be worse
- automatically remove qgroups of deleted subvolumes:
- adjust the qgroup removal conditions: make sure all related
subvolume data has already been removed, or return EBUSY; also
take into account the sysfs drop_subtree_threshold setting
- also works in squota mode
- mount option updates: new modes of 'rescue=' that allow mounting
images (read-only) that could have been partially converted by user
space tools (see the mount(2) sketch after this list)
- ignoremetacsums - invalid metadata checksums are ignored
- ignoresuperflags - ignore super block flags that track a conversion
in progress (like UUID or checksums)
Core:
- size of struct btrfs_inode is now below 1024 bytes (on a release
config), with improved memory packing and other secondary effects
- switch tracking of open inodes from an rb-tree to an xarray, a
minor performance improvement
- reduce the number of empty transaction commits when there is no
dirty data/metadata
- memory allocation optimizations (fewer allocations, reordering out
of critical sections)
- extent map structure optimizations and refactoring, more sanity
checks
- more preparations and fixes for subpage and zoned mode
- general snapshot code cleanups, improvements and documentation
- tree-checker updates: continued work on file extent ram_bytes fixes
- raid-stripe-tree update (not backward compatible):
- remove the extent encoding field from the structure; it can be
inferred from other information
- requires btrfs-progs 6.9.1 or newer
- cleanups and refactoring
- error message updates
- error handling improvements
- return type and parameter cleanups and improvements"
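The reclaim tunables and stats described under "User visible features" are plain sysfs files. Below is a small sketch, assuming the usual per-filesystem layout /sys/fs/btrfs/<UUID>/allocation/data/ and stat file names such as reclaim_count and reclaim_bytes (the exact file set is an assumption based on the description above), that enables dynamic reclaim and dumps the counters:

```c
/*
 * Sketch: enable dynamic block group reclaim for the data block group
 * type of one filesystem and print the reclaim_* stats. The UUID is a
 * placeholder and the exact stat file names (reclaim_count,
 * reclaim_bytes) are assumptions based on the description above.
 */
#include <stdio.h>

#define SYSFS_DIR "/sys/fs/btrfs/YOUR-FS-UUID/allocation/data"

static void dump_stat(const char *name)
{
	char path[256], buf[64];
	FILE *f;

	snprintf(path, sizeof(path), SYSFS_DIR "/%s", name);
	f = fopen(path, "r");
	if (!f)
		return;
	if (fgets(buf, sizeof(buf), f))
		printf("%s: %s", name, buf);
	fclose(f);
}

int main(void)
{
	FILE *f = fopen(SYSFS_DIR "/dynamic_reclaim", "w");

	/* "1" switches from the fixed bg_reclaim_threshold to heuristics. */
	if (f) {
		fputs("1\n", f);
		fclose(f);
	}
	dump_stat("bg_reclaim_threshold");
	dump_stat("reclaim_count");
	dump_stat("reclaim_bytes");
	return 0;
}
```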
* tag 'for-6.11-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (152 commits)
btrfs: fix extent map use-after-free when adding pages to compressed bio
btrfs: fix bitmap leak when loading free space cache on duplicate entry
btrfs: remove the BUG_ON() inside extent_range_clear_dirty_for_io()
btrfs: move extent_range_clear_dirty_for_io() into inode.c
btrfs: enhance compression error messages
btrfs: fix data race when accessing the last_trans field of a root
btrfs: rename the extra_gfp parameter of btrfs_alloc_page_array()
btrfs: remove the extra_gfp parameter from btrfs_alloc_folio_array()
btrfs: introduce new "rescue=ignoresuperflags" mount option
btrfs: introduce new "rescue=ignoremetacsums" mount option
btrfs: output the unrecognized super block flags as hex
btrfs: remove unused Opt enums
btrfs: tree-checker: add extra ram_bytes and disk_num_bytes check
btrfs: fix the ram_bytes assignment for truncated ordered extents
btrfs: make validate_extent_map() catch ram_bytes mismatch
btrfs: ignore incorrect btrfs_file_extent_item::ram_bytes
btrfs: cleanup the bytenr usage inside btrfs_extent_item_to_extent_map()
btrfs: fix typo in error message in btrfs_validate_super()
btrfs: move the direct IO code into its own file
btrfs: pass a btrfs_inode to btrfs_set_prop()
...
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r-- | fs/btrfs/inode.c | 1459 |
1 file changed, 316 insertions(+), 1143 deletions(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d62c96f00ff8..01eab6955647 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -70,31 +70,13 @@ #include "orphan.h" #include "backref.h" #include "raid-stripe-tree.h" +#include "fiemap.h" struct btrfs_iget_args { u64 ino; struct btrfs_root *root; }; -struct btrfs_dio_data { - ssize_t submitted; - struct extent_changeset *data_reserved; - struct btrfs_ordered_extent *ordered; - bool data_space_reserved; - bool nocow_done; -}; - -struct btrfs_dio_private { - /* Range of I/O */ - u64 file_offset; - u32 bytes; - - /* This must be last */ - struct btrfs_bio bbio; -}; - -static struct bio_set btrfs_dio_bioset; - struct btrfs_rename_ctx { /* Output field. Stores the index number of the old directory entry. */ u64 index; @@ -137,11 +119,6 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty); -static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, - u64 len, u64 orig_start, u64 block_start, - u64 block_len, u64 orig_block_len, - u64 ram_bytes, int compress_type, - int type); static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, u64 root, void *warn_ctx) @@ -877,7 +854,7 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, if (btrfs_test_opt(fs_info, COMPRESS) || inode->flags & BTRFS_INODE_COMPRESS || inode->prop_compress) - return btrfs_compress_heuristic(&inode->vfs_inode, start, end); + return btrfs_compress_heuristic(inode, start, end); return 0; } @@ -890,6 +867,26 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, btrfs_add_inode_defrag(NULL, inode, small_write); } +static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +{ + unsigned long end_index = end >> PAGE_SHIFT; + struct page *page; + int ret = 0; + + for (unsigned long index = start >> PAGE_SHIFT; + index <= end_index; index++) { + page = find_get_page(inode->i_mapping, index); + if (unlikely(!page)) { + if (!ret) + ret = -ENOENT; + continue; + } + clear_page_dirty_for_io(page); + put_page(page); + } + return ret; +} + /* * Work queue call back to started compression on a file and pages. * @@ -931,7 +928,16 @@ static void compress_file_range(struct btrfs_work *work) * Otherwise applications with the file mmap'd can wander in and change * the page contents while we are compressing them. */ - extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); + ret = extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); + + /* + * All the folios should have been locked thus no failure. + * + * And even if some folios are missing, btrfs_compress_folios() + * would handle them correctly, so here just do an ASSERT() check for + * early logic errors. 
+ */ + ASSERT(ret == 0); /* * We need to save i_size before now because it could change in between @@ -1152,6 +1158,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ordered_extent *ordered; + struct btrfs_file_extent file_extent; struct btrfs_key ins; struct page *locked_page = NULL; struct extent_state *cached = NULL; @@ -1198,29 +1205,22 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, lock_extent(io_tree, start, end, &cached); /* Here we're doing allocation and writeback of the compressed pages */ - em = create_io_em(inode, start, - async_extent->ram_size, /* len */ - start, /* orig_start */ - ins.objectid, /* block_start */ - ins.offset, /* block_len */ - ins.offset, /* orig_block_len */ - async_extent->ram_size, /* ram_bytes */ - async_extent->compress_type, - BTRFS_ORDERED_COMPRESSED); + file_extent.disk_bytenr = ins.objectid; + file_extent.disk_num_bytes = ins.offset; + file_extent.ram_bytes = async_extent->ram_size; + file_extent.num_bytes = async_extent->ram_size; + file_extent.offset = 0; + file_extent.compression = async_extent->compress_type; + + em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out_free_reserve; } free_extent_map(em); - ordered = btrfs_alloc_ordered_extent(inode, start, /* file_offset */ - async_extent->ram_size, /* num_bytes */ - async_extent->ram_size, /* ram_bytes */ - ins.objectid, /* disk_bytenr */ - ins.offset, /* disk_num_bytes */ - 0, /* offset */ - 1 << BTRFS_ORDERED_COMPRESSED, - async_extent->compress_type); + ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, + 1 << BTRFS_ORDERED_COMPRESSED); if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); ret = PTR_ERR(ordered); @@ -1264,8 +1264,8 @@ out_free_reserve: kfree(async_extent); } -static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, - u64 num_bytes) +u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, + u64 num_bytes) { struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; @@ -1279,15 +1279,15 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * first block in this inode and use that as a hint. If that * block is also bogus then just don't worry about it. 
*/ - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { free_extent_map(em); em = search_extent_mapping(em_tree, 0, 0); - if (em && em->block_start < EXTENT_MAP_LAST_BYTE) - alloc_hint = em->block_start; + if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE) + alloc_hint = extent_map_block_start(em); if (em) free_extent_map(em); } else { - alloc_hint = em->block_start; + alloc_hint = extent_map_block_start(em); free_extent_map(em); } } @@ -1375,7 +1375,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, } } - alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); + alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes); /* * Relocation relies on the relocated extents to have exactly the same @@ -1395,6 +1395,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, while (num_bytes > 0) { struct btrfs_ordered_extent *ordered; + struct btrfs_file_extent file_extent; cur_alloc_size = num_bytes; ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, @@ -1431,18 +1432,18 @@ static noinline int cow_file_range(struct btrfs_inode *inode, extent_reserved = true; ram_size = ins.offset; + file_extent.disk_bytenr = ins.objectid; + file_extent.disk_num_bytes = ins.offset; + file_extent.num_bytes = ins.offset; + file_extent.ram_bytes = ins.offset; + file_extent.offset = 0; + file_extent.compression = BTRFS_COMPRESS_NONE; lock_extent(&inode->io_tree, start, start + ram_size - 1, &cached); - em = create_io_em(inode, start, ins.offset, /* len */ - start, /* orig_start */ - ins.objectid, /* block_start */ - ins.offset, /* block_len */ - ins.offset, /* orig_block_len */ - ram_size, /* ram_bytes */ - BTRFS_COMPRESS_NONE, /* compress_type */ - BTRFS_ORDERED_REGULAR /* type */); + em = btrfs_create_io_em(inode, start, &file_extent, + BTRFS_ORDERED_REGULAR); if (IS_ERR(em)) { unlock_extent(&inode->io_tree, start, start + ram_size - 1, &cached); @@ -1451,10 +1452,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode, } free_extent_map(em); - ordered = btrfs_alloc_ordered_extent(inode, start, ram_size, - ram_size, ins.objectid, cur_alloc_size, - 0, 1 << BTRFS_ORDERED_REGULAR, - BTRFS_COMPRESS_NONE); + ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, + 1 << BTRFS_ORDERED_REGULAR); if (IS_ERR(ordered)) { unlock_extent(&inode->io_tree, start, start + ram_size - 1, &cached); @@ -1617,10 +1616,8 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_ u64 alloc_hint = 0; if (do_free) { - struct async_chunk *async_chunk; struct async_cow *async_cow; - async_chunk = container_of(work, struct async_chunk, work); btrfs_add_delayed_iput(async_chunk->inode); if (async_chunk->blkcg_css) css_put(async_chunk->blkcg_css); @@ -1850,13 +1847,11 @@ struct can_nocow_file_extent_args { */ bool free_path; - /* Output fields. Only set when can_nocow_file_extent() returns 1. */ - - u64 disk_bytenr; - u64 disk_num_bytes; - u64 extent_offset; - /* Number of bytes that can be written to in NOCOW mode. */ - u64 num_bytes; + /* + * Output fields. Only set when can_nocow_file_extent() returns 1. + * The expected file extent for the NOCOW write. 
+ */ + struct btrfs_file_extent file_extent; }; /* @@ -1878,6 +1873,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *fi; struct btrfs_root *csum_root; + u64 io_start; u64 extent_end; u8 extent_type; int can_nocow = 0; @@ -1890,11 +1886,6 @@ static int can_nocow_file_extent(struct btrfs_path *path, if (extent_type == BTRFS_FILE_EXTENT_INLINE) goto out; - /* Can't access these fields unless we know it's not an inline extent. */ - args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); - args->extent_offset = btrfs_file_extent_offset(leaf, fi); - if (!(inode->flags & BTRFS_INODE_NODATACOW) && extent_type == BTRFS_FILE_EXTENT_REG) goto out; @@ -1910,7 +1901,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, goto out; /* An explicit hole, must COW. */ - if (args->disk_bytenr == 0) + if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) goto out; /* Compressed/encrypted/encoded extents must be COWed. */ @@ -1921,6 +1912,12 @@ static int can_nocow_file_extent(struct btrfs_path *path, extent_end = btrfs_file_extent_end(path); + args->file_extent.disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + args->file_extent.disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + args->file_extent.ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + args->file_extent.offset = btrfs_file_extent_offset(leaf, fi); + args->file_extent.compression = btrfs_file_extent_compression(leaf, fi); + /* * The following checks can be expensive, as they need to take other * locks and do btree or rbtree searches, so release the path to avoid @@ -1929,8 +1926,8 @@ static int can_nocow_file_extent(struct btrfs_path *path, btrfs_release_path(path); ret = btrfs_cross_ref_exist(root, btrfs_ino(inode), - key->offset - args->extent_offset, - args->disk_bytenr, args->strict, path); + key->offset - args->file_extent.offset, + args->file_extent.disk_bytenr, args->strict, path); WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out; @@ -1951,18 +1948,18 @@ static int can_nocow_file_extent(struct btrfs_path *path, atomic_read(&root->snapshot_force_cow)) goto out; - args->disk_bytenr += args->extent_offset; - args->disk_bytenr += args->start - key->offset; - args->num_bytes = min(args->end + 1, extent_end) - args->start; + args->file_extent.num_bytes = min(args->end + 1, extent_end) - args->start; + args->file_extent.offset += args->start - key->offset; + io_start = args->file_extent.disk_bytenr + args->file_extent.offset; /* * Force COW if csums exist in the range. This ensures that csums for a * given extent are either valid or do not exist. 
*/ - csum_root = btrfs_csum_root(root->fs_info, args->disk_bytenr); - ret = btrfs_lookup_csums_list(csum_root, args->disk_bytenr, - args->disk_bytenr + args->num_bytes - 1, + csum_root = btrfs_csum_root(root->fs_info, io_start); + ret = btrfs_lookup_csums_list(csum_root, io_start, + io_start + args->file_extent.num_bytes - 1, NULL, nowait); WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) @@ -2021,7 +2018,6 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, struct extent_buffer *leaf; struct extent_state *cached_state = NULL; u64 extent_end; - u64 ram_bytes; u64 nocow_end; int extent_type; bool is_prealloc; @@ -2100,7 +2096,6 @@ next_slot: ret = -EUCLEAN; goto error; } - ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); extent_end = btrfs_file_extent_end(path); /* @@ -2120,7 +2115,9 @@ next_slot: goto must_cow; ret = 0; - nocow_bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr); + nocow_bg = btrfs_inc_nocow_writers(fs_info, + nocow_args.file_extent.disk_bytenr + + nocow_args.file_extent.offset); if (!nocow_bg) { must_cow: /* @@ -2156,21 +2153,16 @@ must_cow: } } - nocow_end = cur_offset + nocow_args.num_bytes - 1; + nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1; lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state); is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC; if (is_prealloc) { - u64 orig_start = found_key.offset - nocow_args.extent_offset; struct extent_map *em; - em = create_io_em(inode, cur_offset, nocow_args.num_bytes, - orig_start, - nocow_args.disk_bytenr, /* block_start */ - nocow_args.num_bytes, /* block_len */ - nocow_args.disk_num_bytes, /* orig_block_len */ - ram_bytes, BTRFS_COMPRESS_NONE, - BTRFS_ORDERED_PREALLOC); + em = btrfs_create_io_em(inode, cur_offset, + &nocow_args.file_extent, + BTRFS_ORDERED_PREALLOC); if (IS_ERR(em)) { unlock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state); @@ -2182,12 +2174,10 @@ must_cow: } ordered = btrfs_alloc_ordered_extent(inode, cur_offset, - nocow_args.num_bytes, nocow_args.num_bytes, - nocow_args.disk_bytenr, nocow_args.num_bytes, 0, + &nocow_args.file_extent, is_prealloc ? (1 << BTRFS_ORDERED_PREALLOC) - : (1 << BTRFS_ORDERED_NOCOW), - BTRFS_COMPRESS_NONE); + : (1 << BTRFS_ORDERED_NOCOW)); btrfs_dec_nocow_writers(nocow_bg); if (IS_ERR(ordered)) { if (is_prealloc) { @@ -2601,44 +2591,6 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, } } -static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, - struct btrfs_ordered_extent *ordered) -{ - u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; - u64 len = bbio->bio.bi_iter.bi_size; - struct btrfs_ordered_extent *new; - int ret; - - /* Must always be called for the beginning of an ordered extent. */ - if (WARN_ON_ONCE(start != ordered->disk_bytenr)) - return -EINVAL; - - /* No need to split if the ordered extent covers the entire bio. */ - if (ordered->disk_num_bytes == len) { - refcount_inc(&ordered->refs); - bbio->ordered = ordered; - return 0; - } - - /* - * Don't split the extent_map for NOCOW extents, as we're writing into - * a pre-existing one. - */ - if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { - ret = split_extent_map(bbio->inode, bbio->file_offset, - ordered->num_bytes, len, - ordered->disk_bytenr); - if (ret) - return ret; - } - - new = btrfs_split_ordered_extent(ordered, len); - if (IS_ERR(new)) - return PTR_ERR(new); - bbio->ordered = new; - return 0; -} - /* * given a list of ordered sums record them in the inode. 
This happens * at IO completion time based on sums calculated at bio submission time. @@ -2681,7 +2633,7 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, if (IS_ERR(em)) return PTR_ERR(em); - if (em->block_start != EXTENT_MAP_HOLE) + if (em->disk_bytenr != EXTENT_MAP_HOLE) goto next; em_len = em->len; @@ -3037,10 +2989,8 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, oe->disk_num_bytes); btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset); - if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) { + if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) num_bytes = oe->truncated_len; - ram_bytes = num_bytes; - } btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes); btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes); btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); @@ -3056,7 +3006,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) || test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags); - return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), + return insert_reserved_file_extent(trans, oe->inode, oe->file_offset, &stack_fi, update_inode_bytes, oe->qgroup_rsv); } @@ -3068,7 +3018,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, */ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) { - struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode); + struct btrfs_inode *inode = ordered_extent->inode; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans = NULL; @@ -3302,7 +3252,7 @@ out: int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) { - if (btrfs_is_zoned(inode_to_fs_info(ordered->inode)) && + if (btrfs_is_zoned(ordered->inode->root->fs_info) && !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && list_empty(&ordered->bioc_list)) btrfs_finish_ordered_zoned(ordered); @@ -3596,7 +3546,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) found_key.objectid = found_key.offset; found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; - inode = btrfs_iget(fs_info->sb, last_objectid, root); + inode = btrfs_iget(last_objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); inode = NULL; @@ -3781,6 +3731,30 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, return 1; } +static int btrfs_init_file_extent_tree(struct btrfs_inode *inode) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + if (WARN_ON_ONCE(inode->file_extent_tree)) + return 0; + if (btrfs_fs_incompat(fs_info, NO_HOLES)) + return 0; + if (!S_ISREG(inode->vfs_inode.i_mode)) + return 0; + if (btrfs_is_free_space_inode(inode)) + return 0; + + inode->file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL); + if (!inode->file_extent_tree) + return -ENOMEM; + + extent_io_tree_init(fs_info, inode->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); + /* Lockdep class is set only for the file extent tree. 
*/ + lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class); + + return 0; +} + /* * read an inode from the btree into the in-memory inode */ @@ -3800,6 +3774,10 @@ static int btrfs_read_locked_inode(struct inode *inode, bool filled = false; int first_xattr_slot; + ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); + if (ret) + return ret; + ret = btrfs_fill_inode(inode, &rdev); if (!ret) filled = true; @@ -3810,7 +3788,7 @@ static int btrfs_read_locked_inode(struct inode *inode, return -ENOMEM; } - memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); + btrfs_get_inode_key(BTRFS_I(inode), &location); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) { @@ -3856,7 +3834,9 @@ static int btrfs_read_locked_inode(struct inode *inode, inode->i_rdev = 0; rdev = btrfs_inode_rdev(leaf, inode_item); - BTRFS_I(inode)->index_cnt = (u64)-1; + if (S_ISDIR(inode->i_mode)) + BTRFS_I(inode)->index_cnt = (u64)-1; + btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); @@ -4038,13 +4018,15 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *inode_item; struct btrfs_path *path; struct extent_buffer *leaf; + struct btrfs_key key; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - ret = btrfs_lookup_inode(trans, inode->root, path, &inode->location, 1); + btrfs_get_inode_key(inode, &key); + ret = btrfs_lookup_inode(trans, inode->root, path, &key, 1); if (ret) { if (ret > 0) ret = -ENOENT; @@ -4308,7 +4290,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { objectid = btrfs_root_id(inode->root); } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { - objectid = inode->location.objectid; + objectid = inode->ref_root_id; } else { WARN_ON(1); fscrypt_free_filename(&fname); @@ -4539,11 +4521,6 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) ret = PTR_ERR(trans); goto out_release; } - ret = btrfs_record_root_in_trans(trans, root); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_end_trans; - } btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); qgroup_reserved = 0; trans->block_rsv = &block_rsv; @@ -4967,11 +4944,9 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) } hole_em->start = cur_offset; hole_em->len = hole_size; - hole_em->orig_start = cur_offset; - hole_em->block_start = EXTENT_MAP_HOLE; - hole_em->block_len = 0; - hole_em->orig_block_len = 0; + hole_em->disk_bytenr = EXTENT_MAP_HOLE; + hole_em->disk_num_bytes = 0; hole_em->ram_bytes = hole_size; hole_em->generation = btrfs_get_fs_generation(fs_info); @@ -5049,7 +5024,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); if (btrfs_is_zoned(fs_info)) { - ret = btrfs_wait_ordered_range(inode, + ret = btrfs_wait_ordered_range(BTRFS_I(inode), ALIGN(newsize, fs_info->sectorsize), (u64)-1); if (ret) @@ -5079,7 +5054,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * wait for disk_i_size to be stable and then update the * in-memory size to match. 
*/ - err = btrfs_wait_ordered_range(inode, 0, (u64)-1); + err = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (err) return err; i_size_write(inode, BTRFS_I(inode)->disk_i_size); @@ -5493,59 +5468,52 @@ out: return err; } -static void inode_tree_add(struct btrfs_inode *inode) +static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) { struct btrfs_root *root = inode->root; - struct btrfs_inode *entry; - struct rb_node **p; - struct rb_node *parent; - struct rb_node *new = &inode->rb_node; - u64 ino = btrfs_ino(inode); + struct btrfs_inode *existing; + const u64 ino = btrfs_ino(inode); + int ret; if (inode_unhashed(&inode->vfs_inode)) - return; - parent = NULL; - spin_lock(&root->inode_lock); - p = &root->inode_tree.rb_node; - while (*p) { - parent = *p; - entry = rb_entry(parent, struct btrfs_inode, rb_node); + return 0; - if (ino < btrfs_ino(entry)) - p = &parent->rb_left; - else if (ino > btrfs_ino(entry)) - p = &parent->rb_right; - else { - WARN_ON(!(entry->vfs_inode.i_state & - (I_WILL_FREE | I_FREEING))); - rb_replace_node(parent, new, &root->inode_tree); - RB_CLEAR_NODE(parent); - spin_unlock(&root->inode_lock); - return; - } + if (prealloc) { + ret = xa_reserve(&root->inodes, ino, GFP_NOFS); + if (ret) + return ret; } - rb_link_node(new, parent, p); - rb_insert_color(new, &root->inode_tree); - spin_unlock(&root->inode_lock); + + existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC); + + if (xa_is_err(existing)) { + ret = xa_err(existing); + ASSERT(ret != -EINVAL); + ASSERT(ret != -ENOMEM); + return ret; + } else if (existing) { + WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); + } + + return 0; } -static void inode_tree_del(struct btrfs_inode *inode) +static void btrfs_del_inode_from_root(struct btrfs_inode *inode) { struct btrfs_root *root = inode->root; - int empty = 0; + struct btrfs_inode *entry; + bool empty = false; - spin_lock(&root->inode_lock); - if (!RB_EMPTY_NODE(&inode->rb_node)) { - rb_erase(&inode->rb_node, &root->inode_tree); - RB_CLEAR_NODE(&inode->rb_node); - empty = RB_EMPTY_ROOT(&root->inode_tree); - } - spin_unlock(&root->inode_lock); + xa_lock(&root->inodes); + entry = __xa_erase(&root->inodes, btrfs_ino(inode)); + if (entry == inode) + empty = xa_empty(&root->inodes); + xa_unlock(&root->inodes); if (empty && btrfs_root_refs(&root->root_item) == 0) { - spin_lock(&root->inode_lock); - empty = RB_EMPTY_ROOT(&root->inode_tree); - spin_unlock(&root->inode_lock); + xa_lock(&root->inodes); + empty = xa_empty(&root->inodes); + xa_unlock(&root->inodes); if (empty) btrfs_add_dead_root(root); } @@ -5556,10 +5524,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) { struct btrfs_iget_args *args = p; - inode->i_ino = args->ino; - BTRFS_I(inode)->location.objectid = args->ino; - BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; - BTRFS_I(inode)->location.offset = 0; + btrfs_set_inode_number(BTRFS_I(inode), args->ino); BTRFS_I(inode)->root = btrfs_grab_root(args->root); if (args->root && args->root == args->root->fs_info->tree_root && @@ -5573,12 +5538,11 @@ static int btrfs_find_actor(struct inode *inode, void *opaque) { struct btrfs_iget_args *args = opaque; - return args->ino == BTRFS_I(inode)->location.objectid && + return args->ino == btrfs_ino(BTRFS_I(inode)) && args->root == BTRFS_I(inode)->root; } -static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino, - struct btrfs_root *root) +static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) { struct inode *inode; 
struct btrfs_iget_args args; @@ -5587,7 +5551,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino, args.ino = ino; args.root = root; - inode = iget5_locked_rcu(s, hashval, btrfs_find_actor, + inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor, btrfs_init_locked_inode, (void *)&args); return inode; @@ -5599,41 +5563,44 @@ static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino, * allocator. NULL is also valid but may require an additional allocation * later. */ -struct inode *btrfs_iget_path(struct super_block *s, u64 ino, - struct btrfs_root *root, struct btrfs_path *path) +struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, + struct btrfs_path *path) { struct inode *inode; + int ret; - inode = btrfs_iget_locked(s, ino, root); + inode = btrfs_iget_locked(ino, root); if (!inode) return ERR_PTR(-ENOMEM); - if (inode->i_state & I_NEW) { - int ret; + if (!(inode->i_state & I_NEW)) + return inode; - ret = btrfs_read_locked_inode(inode, path); - if (!ret) { - inode_tree_add(BTRFS_I(inode)); - unlock_new_inode(inode); - } else { - iget_failed(inode); - /* - * ret > 0 can come from btrfs_search_slot called by - * btrfs_read_locked_inode, this means the inode item - * was not found. - */ - if (ret > 0) - ret = -ENOENT; - inode = ERR_PTR(ret); - } - } + ret = btrfs_read_locked_inode(inode, path); + /* + * ret > 0 can come from btrfs_search_slot called by + * btrfs_read_locked_inode(), this means the inode item was not found. + */ + if (ret > 0) + ret = -ENOENT; + if (ret < 0) + goto error; + + ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); + if (ret < 0) + goto error; + + unlock_new_inode(inode); return inode; +error: + iget_failed(inode); + return ERR_PTR(ret); } -struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root) +struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) { - return btrfs_iget_path(s, ino, root, NULL); + return btrfs_iget_path(ino, root, NULL); } static struct inode *new_simple_dir(struct inode *dir, @@ -5647,10 +5614,11 @@ static struct inode *new_simple_dir(struct inode *dir, return ERR_PTR(-ENOMEM); BTRFS_I(inode)->root = btrfs_grab_root(root); - memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); + BTRFS_I(inode)->ref_root_id = key->objectid; + set_bit(BTRFS_INODE_ROOT_STUB, &BTRFS_I(inode)->runtime_flags); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); - inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; + btrfs_set_inode_number(BTRFS_I(inode), BTRFS_EMPTY_SUBVOL_DIR_OBJECTID); /* * We only need lookup, the rest is read-only and there's no inode * associated with the dentry @@ -5704,7 +5672,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return ERR_PTR(ret); if (location.type == BTRFS_INODE_ITEM_KEY) { - inode = btrfs_iget(dir->i_sb, location.objectid, root); + inode = btrfs_iget(location.objectid, root); if (IS_ERR(inode)) return inode; @@ -5728,7 +5696,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) else inode = new_simple_dir(dir, &location, root); } else { - inode = btrfs_iget(dir->i_sb, location.objectid, sub_root); + inode = btrfs_iget(location.objectid, sub_root); btrfs_put_root(sub_root); if (IS_ERR(inode)) @@ -5948,7 +5916,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) addr = private->filldir_buf; path->reada = READA_FORWARD; - put = btrfs_readdir_get_delayed_items(inode, private->last_index, + put = btrfs_readdir_get_delayed_items(BTRFS_I(inode), 
private->last_index, &ins_list, &del_list); again: @@ -6038,7 +6006,7 @@ nopos: ret = 0; err: if (put) - btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list); + btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list); btrfs_free_path(path); return ret; } @@ -6123,7 +6091,7 @@ static int btrfs_insert_inode_locked(struct inode *inode) { struct btrfs_iget_args args; - args.ino = BTRFS_I(inode)->location.objectid; + args.ino = btrfs_ino(BTRFS_I(inode)); args.root = BTRFS_I(inode)->root; return insert_inode_locked4(inode, @@ -6230,7 +6198,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_root *root; struct btrfs_inode_item *inode_item; - struct btrfs_key *location; struct btrfs_path *path; u64 objectid; struct btrfs_inode_ref *ref; @@ -6239,6 +6206,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, struct btrfs_item_batch batch; unsigned long ptr; int ret; + bool xa_reserved = false; path = btrfs_alloc_path(); if (!path) @@ -6248,10 +6216,19 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root); root = BTRFS_I(inode)->root; + ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); + if (ret) + goto out; + ret = btrfs_get_free_objectid(root, &objectid); if (ret) goto out; - inode->i_ino = objectid; + btrfs_set_inode_number(BTRFS_I(inode), objectid); + + ret = xa_reserve(&root->inodes, objectid, GFP_NOFS); + if (ret) + goto out; + xa_reserved = true; if (args->orphan) { /* @@ -6266,8 +6243,10 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (ret) goto out; } - /* index_cnt is ignored for everything but a dir. */ - BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX; + + if (S_ISDIR(inode->i_mode)) + BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX; + BTRFS_I(inode)->generation = trans->transid; inode->i_generation = BTRFS_I(inode)->generation; @@ -6294,11 +6273,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, BTRFS_INODE_NODATASUM; } - location = &BTRFS_I(inode)->location; - location->objectid = objectid; - location->offset = 0; - location->type = BTRFS_INODE_ITEM_KEY; - ret = btrfs_insert_inode_locked(inode); if (ret < 0) { if (!args->orphan) @@ -6397,8 +6371,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, * Subvolumes inherit properties from their parent subvolume, * not the directory they were created in. */ - parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID, - BTRFS_I(dir)->root); + parent = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, BTRFS_I(dir)->root); if (IS_ERR(parent)) { ret = PTR_ERR(parent); } else { @@ -6426,7 +6399,12 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, } } - inode_tree_add(BTRFS_I(inode)); + ret = btrfs_add_inode_to_root(BTRFS_I(inode), false); + if (WARN_ON(ret)) { + /* Shouldn't happen, we used xa_reserve() before. 
*/ + btrfs_abort_transaction(trans, ret); + goto discard; + } trace_btrfs_inode_new(inode); btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); @@ -6454,6 +6432,9 @@ discard: ihold(inode); discard_new_inode(inode); out: + if (xa_reserved) + xa_release(&root->inodes, objectid); + btrfs_free_path(path); return ret; } @@ -6818,7 +6799,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, if (em) { if (em->start > start || em->start + em->len <= start) free_extent_map(em); - else if (em->block_start == EXTENT_MAP_INLINE && page) + else if (em->disk_bytenr == EXTENT_MAP_INLINE && page) free_extent_map(em); else goto out; @@ -6829,9 +6810,8 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, goto out; } em->start = EXTENT_MAP_HOLE; - em->orig_start = EXTENT_MAP_HOLE; + em->disk_bytenr = EXTENT_MAP_HOLE; em->len = (u64)-1; - em->block_len = (u64)-1; path = btrfs_alloc_path(); if (!path) { @@ -6921,9 +6901,8 @@ next: /* New extent overlaps with existing one */ em->start = start; - em->orig_start = start; em->len = found_key.offset - start; - em->block_start = EXTENT_MAP_HOLE; + em->disk_bytenr = EXTENT_MAP_HOLE; goto insert; } @@ -6947,7 +6926,7 @@ next: * * Other members are not utilized for inline extents. */ - ASSERT(em->block_start == EXTENT_MAP_INLINE); + ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE); ASSERT(em->len == fs_info->sectorsize); ret = read_inline_extent(inode, path, page); @@ -6957,9 +6936,8 @@ next: } not_found: em->start = start; - em->orig_start = start; em->len = len; - em->block_start = EXTENT_MAP_HOLE; + em->disk_bytenr = EXTENT_MAP_HOLE; insert: ret = 0; btrfs_release_path(path); @@ -6986,84 +6964,6 @@ out: return em; } -static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, - struct btrfs_dio_data *dio_data, - const u64 start, - const u64 len, - const u64 orig_start, - const u64 block_start, - const u64 block_len, - const u64 orig_block_len, - const u64 ram_bytes, - const int type) -{ - struct extent_map *em = NULL; - struct btrfs_ordered_extent *ordered; - - if (type != BTRFS_ORDERED_NOCOW) { - em = create_io_em(inode, start, len, orig_start, block_start, - block_len, orig_block_len, ram_bytes, - BTRFS_COMPRESS_NONE, /* compress_type */ - type); - if (IS_ERR(em)) - goto out; - } - ordered = btrfs_alloc_ordered_extent(inode, start, len, len, - block_start, block_len, 0, - (1 << type) | - (1 << BTRFS_ORDERED_DIRECT), - BTRFS_COMPRESS_NONE); - if (IS_ERR(ordered)) { - if (em) { - free_extent_map(em); - btrfs_drop_extent_map_range(inode, start, - start + len - 1, false); - } - em = ERR_CAST(ordered); - } else { - ASSERT(!dio_data->ordered); - dio_data->ordered = ordered; - } - out: - - return em; -} - -static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, - struct btrfs_dio_data *dio_data, - u64 start, u64 len) -{ - struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct extent_map *em; - struct btrfs_key ins; - u64 alloc_hint; - int ret; - - alloc_hint = get_extent_allocation_hint(inode, start, len); -again: - ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, - 0, alloc_hint, &ins, 1, 1); - if (ret == -EAGAIN) { - ASSERT(btrfs_is_zoned(fs_info)); - wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH, - TASK_UNINTERRUPTIBLE); - goto again; - } - if (ret) - return ERR_PTR(ret); - - em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset, start, - ins.objectid, ins.offset, ins.offset, - ins.offset, BTRFS_ORDERED_REGULAR); - 
btrfs_dec_block_group_reservations(fs_info, ins.objectid); - if (IS_ERR(em)) - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, - 1); - - return em; -} - static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) { struct btrfs_block_group *block_group; @@ -7098,8 +6998,8 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) * any ordered extents. */ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, - u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes, bool nowait, bool strict) + struct btrfs_file_extent *file_extent, + bool nowait, bool strict) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct can_nocow_file_extent_args nocow_args = { 0 }; @@ -7149,8 +7049,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); found_type = btrfs_file_extent_type(leaf, fi); - if (ram_bytes) - *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); nocow_args.start = offset; nocow_args.end = offset + *len - 1; @@ -7168,14 +7066,16 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, } ret = 0; - if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr)) + if (btrfs_extent_readonly(fs_info, + nocow_args.file_extent.disk_bytenr + + nocow_args.file_extent.offset)) goto out; if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && found_type == BTRFS_FILE_EXTENT_PREALLOC) { u64 range_end; - range_end = round_up(offset + nocow_args.num_bytes, + range_end = round_up(offset + nocow_args.file_extent.num_bytes, root->fs_info->sectorsize) - 1; ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC); if (ret) { @@ -7184,117 +7084,20 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, } } - if (orig_start) - *orig_start = key.offset - nocow_args.extent_offset; - if (orig_block_len) - *orig_block_len = nocow_args.disk_num_bytes; + if (file_extent) + memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent)); - *len = nocow_args.num_bytes; + *len = nocow_args.file_extent.num_bytes; ret = 1; out: btrfs_free_path(path); return ret; } -static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, - struct extent_state **cached_state, - unsigned int iomap_flags) -{ - const bool writing = (iomap_flags & IOMAP_WRITE); - const bool nowait = (iomap_flags & IOMAP_NOWAIT); - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_ordered_extent *ordered; - int ret = 0; - - while (1) { - if (nowait) { - if (!try_lock_extent(io_tree, lockstart, lockend, - cached_state)) - return -EAGAIN; - } else { - lock_extent(io_tree, lockstart, lockend, cached_state); - } - /* - * We're concerned with the entire range that we're going to be - * doing DIO to, so we need to make sure there's no ordered - * extents in this range. - */ - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart, - lockend - lockstart + 1); - - /* - * We need to make sure there are no buffered pages in this - * range either, we could have raced between the invalidate in - * generic_file_direct_write and locking the extent. The - * invalidate needs to happen so that reads after a write do not - * get stale data. 
- */ - if (!ordered && - (!writing || !filemap_range_has_page(inode->i_mapping, - lockstart, lockend))) - break; - - unlock_extent(io_tree, lockstart, lockend, cached_state); - - if (ordered) { - if (nowait) { - btrfs_put_ordered_extent(ordered); - ret = -EAGAIN; - break; - } - /* - * If we are doing a DIO read and the ordered extent we - * found is for a buffered write, we can not wait for it - * to complete and retry, because if we do so we can - * deadlock with concurrent buffered writes on page - * locks. This happens only if our DIO read covers more - * than one extent map, if at this point has already - * created an ordered extent for a previous extent map - * and locked its range in the inode's io tree, and a - * concurrent write against that previous extent map's - * range and this range started (we unlock the ranges - * in the io tree only when the bios complete and - * buffered writes always lock pages before attempting - * to lock range in the io tree). - */ - if (writing || - test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) - btrfs_start_ordered_extent(ordered); - else - ret = nowait ? -EAGAIN : -ENOTBLK; - btrfs_put_ordered_extent(ordered); - } else { - /* - * We could trigger writeback for this range (and wait - * for it to complete) and then invalidate the pages for - * this range (through invalidate_inode_pages2_range()), - * but that can lead us to a deadlock with a concurrent - * call to readahead (a buffered read or a defrag call - * triggered a readahead) on a page lock due to an - * ordered dio extent we created before but did not have - * yet a corresponding bio submitted (whence it can not - * complete), which makes readahead wait for that - * ordered extent to complete while holding a lock on - * that page. - */ - ret = nowait ? -EAGAIN : -ENOTBLK; - } - - if (ret) - break; - - cond_resched(); - } - - return ret; -} - /* The callers of this must take lock_extent() */ -static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, - u64 len, u64 orig_start, u64 block_start, - u64 block_len, u64 orig_block_len, - u64 ram_bytes, int compress_type, - int type) +struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, + const struct btrfs_file_extent *file_extent, + int type) { struct extent_map *em; int ret; @@ -7313,32 +7116,26 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, switch (type) { case BTRFS_ORDERED_PREALLOC: - /* Uncompressed extents. */ - ASSERT(block_len == len); - /* We're only referring part of a larger preallocated extent. */ - ASSERT(block_len <= ram_bytes); + ASSERT(file_extent->num_bytes <= file_extent->ram_bytes); break; case BTRFS_ORDERED_REGULAR: - /* Uncompressed extents. */ - ASSERT(block_len == len); - /* COW results a new extent matching our file extent size. */ - ASSERT(orig_block_len == len); - ASSERT(ram_bytes == len); + ASSERT(file_extent->disk_num_bytes == file_extent->num_bytes); + ASSERT(file_extent->ram_bytes == file_extent->num_bytes); /* Since it's a new extent, we should not have any offset. */ - ASSERT(orig_start == start); + ASSERT(file_extent->offset == 0); break; case BTRFS_ORDERED_COMPRESSED: /* Must be compressed. */ - ASSERT(compress_type != BTRFS_COMPRESS_NONE); + ASSERT(file_extent->compression != BTRFS_COMPRESS_NONE); /* * Encoded write can make us to refer to part of the * uncompressed extent. 
*/ - ASSERT(len <= ram_bytes); + ASSERT(file_extent->num_bytes <= file_extent->ram_bytes); break; } @@ -7347,16 +7144,15 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, return ERR_PTR(-ENOMEM); em->start = start; - em->orig_start = orig_start; - em->len = len; - em->block_len = block_len; - em->block_start = block_start; - em->orig_block_len = orig_block_len; - em->ram_bytes = ram_bytes; + em->len = file_extent->num_bytes; + em->disk_bytenr = file_extent->disk_bytenr; + em->disk_num_bytes = file_extent->disk_num_bytes; + em->ram_bytes = file_extent->ram_bytes; em->generation = -1; + em->offset = file_extent->offset; em->flags |= EXTENT_FLAG_PINNED; if (type == BTRFS_ORDERED_COMPRESSED) - extent_map_set_compression(em, compress_type); + extent_map_set_compression(em, file_extent->compression); ret = btrfs_replace_extent_map_range(inode, em, true); if (ret) { @@ -7368,580 +7164,6 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, return em; } - -static int btrfs_get_blocks_direct_write(struct extent_map **map, - struct inode *inode, - struct btrfs_dio_data *dio_data, - u64 start, u64 *lenp, - unsigned int iomap_flags) -{ - const bool nowait = (iomap_flags & IOMAP_NOWAIT); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct extent_map *em = *map; - int type; - u64 block_start, orig_start, orig_block_len, ram_bytes; - struct btrfs_block_group *bg; - bool can_nocow = false; - bool space_reserved = false; - u64 len = *lenp; - u64 prev_len; - int ret = 0; - - /* - * We don't allocate a new extent in the following cases - * - * 1) The inode is marked as NODATACOW. In this case we'll just use the - * existing extent. - * 2) The extent is marked as PREALLOC. We're good to go here and can - * just use the extent. - * - */ - if ((em->flags & EXTENT_FLAG_PREALLOC) || - ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && - em->block_start != EXTENT_MAP_HOLE)) { - if (em->flags & EXTENT_FLAG_PREALLOC) - type = BTRFS_ORDERED_PREALLOC; - else - type = BTRFS_ORDERED_NOCOW; - len = min(len, em->len - (start - em->start)); - block_start = em->block_start + (start - em->start); - - if (can_nocow_extent(inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes, false, false) == 1) { - bg = btrfs_inc_nocow_writers(fs_info, block_start); - if (bg) - can_nocow = true; - } - } - - prev_len = len; - if (can_nocow) { - struct extent_map *em2; - - /* We can NOCOW, so only need to reserve metadata space. */ - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, - nowait); - if (ret < 0) { - /* Our caller expects us to free the input extent map. */ - free_extent_map(em); - *map = NULL; - btrfs_dec_nocow_writers(bg); - if (nowait && (ret == -ENOSPC || ret == -EDQUOT)) - ret = -EAGAIN; - goto out; - } - space_reserved = true; - - em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len, - orig_start, block_start, - len, orig_block_len, - ram_bytes, type); - btrfs_dec_nocow_writers(bg); - if (type == BTRFS_ORDERED_PREALLOC) { - free_extent_map(em); - *map = em2; - em = em2; - } - - if (IS_ERR(em2)) { - ret = PTR_ERR(em2); - goto out; - } - - dio_data->nocow_done = true; - } else { - /* Our caller expects us to free the input extent map. */ - free_extent_map(em); - *map = NULL; - - if (nowait) { - ret = -EAGAIN; - goto out; - } - - /* - * If we could not allocate data space before locking the file - * range and we can't do a NOCOW write, then we have to fail. 
- */ - if (!dio_data->data_space_reserved) { - ret = -ENOSPC; - goto out; - } - - /* - * We have to COW and we have already reserved data space before, - * so now we reserve only metadata. - */ - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, - false); - if (ret < 0) - goto out; - space_reserved = true; - - em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; - } - *map = em; - len = min(len, em->len - (start - em->start)); - if (len < prev_len) - btrfs_delalloc_release_metadata(BTRFS_I(inode), - prev_len - len, true); - } - - /* - * We have created our ordered extent, so we can now release our reservation - * for an outstanding extent. - */ - btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len); - - /* - * Need to update the i_size under the extent lock so buffered - * readers will get the updated i_size when we unlock. - */ - if (start + len > i_size_read(inode)) - i_size_write(inode, start + len); -out: - if (ret && space_reserved) { - btrfs_delalloc_release_extents(BTRFS_I(inode), len); - btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); - } - *lenp = len; - return ret; -} - -static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, - loff_t length, unsigned int flags, struct iomap *iomap, - struct iomap *srcmap) -{ - struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct extent_map *em; - struct extent_state *cached_state = NULL; - struct btrfs_dio_data *dio_data = iter->private; - u64 lockstart, lockend; - const bool write = !!(flags & IOMAP_WRITE); - int ret = 0; - u64 len = length; - const u64 data_alloc_len = length; - bool unlock_extents = false; - - /* - * We could potentially fault if we have a buffer > PAGE_SIZE, and if - * we're NOWAIT we may submit a bio for a partial range and return - * EIOCBQUEUED, which would result in an errant short read. - * - * The best way to handle this would be to allow for partial completions - * of iocb's, so we could submit the partial bio, return and fault in - * the rest of the pages, and then submit the io for the rest of the - * range. However we don't have that currently, so simply return - * -EAGAIN at this point so that the normal path is used. - */ - if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE) - return -EAGAIN; - - /* - * Cap the size of reads to that usually seen in buffered I/O as we need - * to allocate a contiguous array for the checksums. - */ - if (!write) - len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS); - - lockstart = start; - lockend = start + len - 1; - - /* - * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't - * enough if we've written compressed pages to this area, so we need to - * flush the dirty pages again to make absolutely sure that any - * outstanding dirty pages are on disk - the first flush only starts - * compression on the data, while keeping the pages locked, so by the - * time the second flush returns we know bios for the compressed pages - * were submitted and finished, and the pages no longer under writeback. - * - * If we have a NOWAIT request and we have any pages in the range that - * are locked, likely due to compression still in progress, we don't want - * to block on page locks. We also don't want to block on pages marked as - * dirty or under writeback (same as for the non-compression case). 
- * iomap_dio_rw() did the same check, but after that and before we got - * here, mmap'ed writes may have happened or buffered reads started - * (readpage() and readahead(), which lock pages), as we haven't locked - * the file range yet. - */ - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) { - if (flags & IOMAP_NOWAIT) { - if (filemap_range_needs_writeback(inode->i_mapping, - lockstart, lockend)) - return -EAGAIN; - } else { - ret = filemap_fdatawrite_range(inode->i_mapping, start, - start + length - 1); - if (ret) - return ret; - } - } - - memset(dio_data, 0, sizeof(*dio_data)); - - /* - * We always try to allocate data space and must do it before locking - * the file range, to avoid deadlocks with concurrent writes to the same - * range if the range has several extents and the writes don't expand the - * current i_size (the inode lock is taken in shared mode). If we fail to - * allocate data space here we continue and later, after locking the - * file range, we fail with ENOSPC only if we figure out we can not do a - * NOCOW write. - */ - if (write && !(flags & IOMAP_NOWAIT)) { - ret = btrfs_check_data_free_space(BTRFS_I(inode), - &dio_data->data_reserved, - start, data_alloc_len, false); - if (!ret) - dio_data->data_space_reserved = true; - else if (ret && !(BTRFS_I(inode)->flags & - (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) - goto err; - } - - /* - * If this errors out it's because we couldn't invalidate pagecache for - * this range and we need to fallback to buffered IO, or we are doing a - * NOWAIT read/write and we need to block. - */ - ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags); - if (ret < 0) - goto err; - - em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto unlock_err; - } - - /* - * Ok for INLINE and COMPRESSED extents we need to fallback on buffered - * io. INLINE is special, and we could probably kludge it in here, but - * it's still buffered so for safety lets just fall back to the generic - * buffered path. - * - * For COMPRESSED we _have_ to read the entire extent in so we can - * decompress it, so there will be buffering required no matter what we - * do, so go ahead and fallback to buffered. - * - * We return -ENOTBLK because that's what makes DIO go ahead and go back - * to buffered IO. Don't blame me, this is the price we pay for using - * the generic code. - */ - if (extent_map_is_compressed(em) || - em->block_start == EXTENT_MAP_INLINE) { - free_extent_map(em); - /* - * If we are in a NOWAIT context, return -EAGAIN in order to - * fallback to buffered IO. This is not only because we can - * block with buffered IO (no support for NOWAIT semantics at - * the moment) but also to avoid returning short reads to user - * space - this happens if we were able to read some data from - * previous non-compressed extents and then when we fallback to - * buffered IO, at btrfs_file_read_iter() by calling - * filemap_read(), we fail to fault in pages for the read buffer, - * in which case filemap_read() returns a short read (the number - * of bytes previously read is > 0, so it does not return -EFAULT). - */ - ret = (flags & IOMAP_NOWAIT) ? 
-EAGAIN : -ENOTBLK; - goto unlock_err; - } - - len = min(len, em->len - (start - em->start)); - - /* - * If we have a NOWAIT request and the range contains multiple extents - * (or a mix of extents and holes), then we return -EAGAIN to make the - * caller fallback to a context where it can do a blocking (without - * NOWAIT) request. This way we avoid doing partial IO and returning - * success to the caller, which is not optimal for writes and for reads - * it can result in unexpected behaviour for an application. - * - * When doing a read, because we use IOMAP_DIO_PARTIAL when calling - * iomap_dio_rw(), we can end up returning less data then what the caller - * asked for, resulting in an unexpected, and incorrect, short read. - * That is, the caller asked to read N bytes and we return less than that, - * which is wrong unless we are crossing EOF. This happens if we get a - * page fault error when trying to fault in pages for the buffer that is - * associated to the struct iov_iter passed to iomap_dio_rw(), and we - * have previously submitted bios for other extents in the range, in - * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of - * those bios have completed by the time we get the page fault error, - * which we return back to our caller - we should only return EIOCBQUEUED - * after we have submitted bios for all the extents in the range. - */ - if ((flags & IOMAP_NOWAIT) && len < length) { - free_extent_map(em); - ret = -EAGAIN; - goto unlock_err; - } - - if (write) { - ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, - start, &len, flags); - if (ret < 0) - goto unlock_err; - unlock_extents = true; - /* Recalc len in case the new em is smaller than requested */ - len = min(len, em->len - (start - em->start)); - if (dio_data->data_space_reserved) { - u64 release_offset; - u64 release_len = 0; - - if (dio_data->nocow_done) { - release_offset = start; - release_len = data_alloc_len; - } else if (len < data_alloc_len) { - release_offset = start + len; - release_len = data_alloc_len - len; - } - - if (release_len > 0) - btrfs_free_reserved_data_space(BTRFS_I(inode), - dio_data->data_reserved, - release_offset, - release_len); - } - } else { - /* - * We need to unlock only the end area that we aren't using. - * The rest is going to be unlocked by the endio routine. - */ - lockstart = start + len; - if (lockstart < lockend) - unlock_extents = true; - } - - if (unlock_extents) - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); - else - free_extent_state(cached_state); - - /* - * Translate extent map information to iomap. - * We trim the extents (and move the addr) even though iomap code does - * that, since we have locked only the parts we are performing I/O in. 
-	 */
-	if ((em->block_start == EXTENT_MAP_HOLE) ||
-	    ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
-		iomap->addr = IOMAP_NULL_ADDR;
-		iomap->type = IOMAP_HOLE;
-	} else {
-		iomap->addr = em->block_start + (start - em->start);
-		iomap->type = IOMAP_MAPPED;
-	}
-	iomap->offset = start;
-	iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
-	iomap->length = len;
-	free_extent_map(em);
-
-	return 0;
-
-unlock_err:
-	unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-		      &cached_state);
-err:
-	if (dio_data->data_space_reserved) {
-		btrfs_free_reserved_data_space(BTRFS_I(inode),
-					       dio_data->data_reserved,
-					       start, data_alloc_len);
-		extent_changeset_free(dio_data->data_reserved);
-	}
-
-	return ret;
-}
-
-static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
-			       ssize_t written, unsigned int flags, struct iomap *iomap)
-{
-	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
-	struct btrfs_dio_data *dio_data = iter->private;
-	size_t submitted = dio_data->submitted;
-	const bool write = !!(flags & IOMAP_WRITE);
-	int ret = 0;
-
-	if (!write && (iomap->type == IOMAP_HOLE)) {
-		/* If reading from a hole, unlock and return */
-		unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
-			      NULL);
-		return 0;
-	}
-
-	if (submitted < length) {
-		pos += submitted;
-		length -= submitted;
-		if (write)
-			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
-						    pos, length, false);
-		else
-			unlock_extent(&BTRFS_I(inode)->io_tree, pos,
-				      pos + length - 1, NULL);
-		ret = -ENOTBLK;
-	}
-	if (write) {
-		btrfs_put_ordered_extent(dio_data->ordered);
-		dio_data->ordered = NULL;
-	}
-
-	if (write)
-		extent_changeset_free(dio_data->data_reserved);
-	return ret;
-}
-
-static void btrfs_dio_end_io(struct btrfs_bio *bbio)
-{
-	struct btrfs_dio_private *dip =
-		container_of(bbio, struct btrfs_dio_private, bbio);
-	struct btrfs_inode *inode = bbio->inode;
-	struct bio *bio = &bbio->bio;
-
-	if (bio->bi_status) {
-		btrfs_warn(inode->root->fs_info,
-			   "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
-			   btrfs_ino(inode), bio->bi_opf,
-			   dip->file_offset, dip->bytes, bio->bi_status);
-	}
-
-	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
-		btrfs_finish_ordered_extent(bbio->ordered, NULL,
-					    dip->file_offset, dip->bytes,
-					    !bio->bi_status);
-	} else {
-		unlock_extent(&inode->io_tree, dip->file_offset,
-			      dip->file_offset + dip->bytes - 1, NULL);
-	}
-
-	bbio->bio.bi_private = bbio->private;
-	iomap_dio_bio_end_io(bio);
-}
-
-static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
-				loff_t file_offset)
-{
-	struct btrfs_bio *bbio = btrfs_bio(bio);
-	struct btrfs_dio_private *dip =
-		container_of(bbio, struct btrfs_dio_private, bbio);
-	struct btrfs_dio_data *dio_data = iter->private;
-
-	btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
-		       btrfs_dio_end_io, bio->bi_private);
-	bbio->inode = BTRFS_I(iter->inode);
-	bbio->file_offset = file_offset;
-
-	dip->file_offset = file_offset;
-	dip->bytes = bio->bi_iter.bi_size;
-
-	dio_data->submitted += bio->bi_iter.bi_size;
-
-	/*
-	 * Check if we are doing a partial write. If we are, we need to split
-	 * the ordered extent to match the submitted bio. Hang on to the
-	 * remaining unfinishable ordered_extent in dio_data so that it can be
-	 * cancelled in iomap_end to avoid a deadlock wherein faulting the
-	 * remaining pages is blocked on the outstanding ordered extent.
-	 */
-	if (iter->flags & IOMAP_WRITE) {
-		int ret;
-
-		ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
-		if (ret) {
-			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
-						    file_offset, dip->bytes,
-						    !ret);
-			bio->bi_status = errno_to_blk_status(ret);
-			iomap_dio_bio_end_io(bio);
-			return;
-		}
-	}
-
-	btrfs_submit_bio(bbio, 0);
-}
-
-static const struct iomap_ops btrfs_dio_iomap_ops = {
-	.iomap_begin = btrfs_dio_iomap_begin,
-	.iomap_end = btrfs_dio_iomap_end,
-};
-
-static const struct iomap_dio_ops btrfs_dio_ops = {
-	.submit_io = btrfs_dio_submit_io,
-	.bio_set = &btrfs_dio_bioset,
-};
-
-ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
-{
-	struct btrfs_dio_data data = { 0 };
-
-	return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
-			    IOMAP_DIO_PARTIAL, &data, done_before);
-}
-
-struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
-				  size_t done_before)
-{
-	struct btrfs_dio_data data = { 0 };
-
-	return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
-			      IOMAP_DIO_PARTIAL, &data, done_before);
-}
-
-static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
-			u64 start, u64 len)
-{
-	struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
-	int ret;
-
-	ret = fiemap_prep(inode, fieinfo, start, &len, 0);
-	if (ret)
-		return ret;
-
-	/*
-	 * fiemap_prep() called filemap_write_and_wait() for the whole possible
-	 * file range (0 to LLONG_MAX), but that is not enough if we have
-	 * compression enabled. The first filemap_fdatawrite_range() only kicks
-	 * in the compression of data (in an async thread) and will return
-	 * before the compression is done and writeback is started. A second
-	 * filemap_fdatawrite_range() is needed to wait for the compression to
-	 * complete and writeback to start. We also need to wait for ordered
-	 * extents to complete, because our fiemap implementation uses mainly
-	 * file extent items to list the extents, searching for extent maps
-	 * only for file ranges with holes or prealloc extents to figure out
-	 * if we have delalloc in those ranges.
-	 */
-	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
-		ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
-		if (ret)
-			return ret;
-	}
-
-	btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);
-
-	/*
-	 * We did an initial flush to avoid holding the inode's lock while
-	 * triggering writeback and waiting for the completion of IO and ordered
-	 * extents. Now after we locked the inode we do it again, because it's
-	 * possible a new write may have happened in between those two steps.
-	 */
-	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
-		ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
-		if (ret) {
-			btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
-			return ret;
-		}
-	}
-
-	ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
-	btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
-
-	return ret;
-}
-
 /*
  * For release_folio() and invalidate_folio() we have a race window where
  * folio_end_writeback() is called but the subpage spinlock is not yet released.
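Everything deleted above (btrfs_dio_iomap_begin()/btrfs_dio_iomap_end(), the bio submission and completion hooks, the btrfs_dio_read()/btrfs_dio_write() entry points and btrfs_fiemap()) is removed from inode.c wholesale; this series relocates code into dedicated files rather than deleting functionality, though the new homes are not visible in this hunk. One convention from the removed code worth keeping in mind is the -ENOTBLK return, which tells the caller to redo the I/O through the page cache. Below is a minimal, illustrative sketch of how a ->read_iter method typically consumes that convention; example_dio_read() is a hypothetical stand-in, not a btrfs function:

#include <linux/fs.h>
#include <linux/pagemap.h>

/* Hypothetical direct I/O entry point; may return -ENOTBLK. */
static ssize_t example_dio_read(struct kiocb *iocb, struct iov_iter *to);

static ssize_t example_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	ssize_t ret = 0;

	if (iocb->ki_flags & IOCB_DIRECT) {
		ret = example_dio_read(iocb, to);
		/* -ENOTBLK: the DIO path asks us to retry buffered. */
		if (ret != -ENOTBLK)
			return ret;
		ret = 0;
	}

	/* Buffered fallback; 'ret' bytes were already transferred. */
	return filemap_read(iocb, to, ret);
}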
@@ -8198,7 +7420,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
 	const u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
 
 	if (!skip_writeback) {
-		ret = btrfs_wait_ordered_range(&inode->vfs_inode,
+		ret = btrfs_wait_ordered_range(inode,
 					       inode->vfs_inode.i_size & (~mask),
 					       (u64)-1);
 		if (ret)
@@ -8399,20 +7621,10 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 	struct btrfs_inode *ei;
 	struct inode *inode;
-	struct extent_io_tree *file_extent_tree = NULL;
-
-	/* Self tests may pass a NULL fs_info. */
-	if (fs_info && !btrfs_fs_incompat(fs_info, NO_HOLES)) {
-		file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL);
-		if (!file_extent_tree)
-			return NULL;
-	}
 
 	ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
-	if (!ei) {
-		kfree(file_extent_tree);
+	if (!ei)
 		return NULL;
-	}
 
 	ei->root = NULL;
 	ei->generation = 0;
@@ -8425,8 +7637,12 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei->disk_i_size = 0;
 	ei->flags = 0;
 	ei->ro_flags = 0;
+	/*
+	 * ->index_cnt will be properly initialized later when creating a new
+	 * inode (btrfs_create_new_inode()) or when reading an existing inode
+	 * from disk (btrfs_read_locked_inode()).
+	 */
 	ei->csum_bytes = 0;
-	ei->index_cnt = (u64)-1;
 	ei->dir_index = 0;
 	ei->last_unlink_trans = 0;
 	ei->last_reflink_trans = 0;
@@ -8453,20 +7669,14 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 
 	extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
 	ei->io_tree.inode = ei;
-	ei->file_extent_tree = file_extent_tree;
-	if (file_extent_tree) {
-		extent_io_tree_init(fs_info, ei->file_extent_tree,
-				    IO_TREE_INODE_FILE_EXTENT);
-		/* Lockdep class is set only for the file extent tree. */
-		lockdep_set_class(&ei->file_extent_tree->lock, &file_extent_tree_class);
-	}
+	ei->file_extent_tree = NULL;
+
 	mutex_init(&ei->log_mutex);
 	spin_lock_init(&ei->ordered_tree_lock);
 	ei->ordered_tree = RB_ROOT;
 	ei->ordered_tree_last = NULL;
 	INIT_LIST_HEAD(&ei->delalloc_inodes);
 	INIT_LIST_HEAD(&ei->delayed_iput);
-	RB_CLEAR_NODE(&ei->rb_node);
 	init_rwsem(&ei->i_mmap_lock);
 
 	return inode;
@@ -8502,9 +7712,10 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
 	if (!S_ISDIR(vfs_inode->i_mode)) {
 		WARN_ON(inode->delalloc_bytes);
 		WARN_ON(inode->new_delalloc_bytes);
+		WARN_ON(inode->csum_bytes);
 	}
-	WARN_ON(inode->csum_bytes);
-	WARN_ON(inode->defrag_bytes);
+	if (!root || !btrfs_is_data_reloc_root(root))
+		WARN_ON(inode->defrag_bytes);
 
 	/*
 	 * This can happen where we create an inode, but somebody else also
@@ -8538,7 +7749,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
 		}
 	}
 	btrfs_qgroup_check_reserved_leak(inode);
-	inode_tree_del(inode);
+	btrfs_del_inode_from_root(inode);
 	btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
 	btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
 	btrfs_put_root(inode->root);
@@ -8572,7 +7783,6 @@ void __cold btrfs_destroy_cachep(void)
 	 * destroy cache.
 	 */
 	rcu_barrier();
-	bioset_exit(&btrfs_dio_bioset);
 	kmem_cache_destroy(btrfs_inode_cachep);
 }
 
@@ -8583,17 +7793,9 @@ int __init btrfs_init_cachep(void)
 			SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
 			init_once);
 	if (!btrfs_inode_cachep)
-		goto fail;
-
-	if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
-			offsetof(struct btrfs_dio_private, bbio.bio),
-			BIOSET_NEED_BVECS))
-		goto fail;
+		return -ENOMEM;
 
 	return 0;
-fail:
-	btrfs_destroy_cachep();
-	return -ENOMEM;
 }
 
 static int btrfs_getattr(struct mnt_idmap *idmap,
@@ -9586,11 +8788,10 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 		}
 
 		em->start = cur_offset;
-		em->orig_start = cur_offset;
 		em->len = ins.offset;
-		em->block_start = ins.objectid;
-		em->block_len = ins.offset;
-		em->orig_block_len = ins.offset;
+		em->disk_bytenr = ins.objectid;
+		em->offset = 0;
+		em->disk_num_bytes = ins.offset;
 		em->ram_bytes = ins.offset;
 		em->flags |= EXTENT_FLAG_PREALLOC;
 		em->generation = trans->transid;
@@ -9956,7 +9157,7 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
 	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
 	if (!pages)
 		return -ENOMEM;
-	ret = btrfs_alloc_page_array(nr_pages, pages, 0);
+	ret = btrfs_alloc_page_array(nr_pages, pages, false);
 	if (ret) {
 		ret = -ENOMEM;
 		goto out;
@@ -10033,7 +9234,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
 	for (;;) {
 		struct btrfs_ordered_extent *ordered;
 
-		ret = btrfs_wait_ordered_range(&inode->vfs_inode, start,
+		ret = btrfs_wait_ordered_range(inode, start,
 					       lockend - start + 1);
 		if (ret)
 			goto out_unlock_inode;
@@ -10053,7 +9254,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
 		goto out_unlock_extent;
 	}
 
-	if (em->block_start == EXTENT_MAP_INLINE) {
+	if (em->disk_bytenr == EXTENT_MAP_INLINE) {
 		u64 extent_start = em->start;
 
 		/*
@@ -10074,33 +9275,33 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
 	 */
 	encoded->len = min_t(u64, extent_map_end(em),
 			     inode->vfs_inode.i_size) - iocb->ki_pos;
-	if (em->block_start == EXTENT_MAP_HOLE ||
+	if (em->disk_bytenr == EXTENT_MAP_HOLE ||
 	    (em->flags & EXTENT_FLAG_PREALLOC)) {
 		disk_bytenr = EXTENT_MAP_HOLE;
 		count = min_t(u64, count, encoded->len);
 		encoded->len = count;
 		encoded->unencoded_len = count;
 	} else if (extent_map_is_compressed(em)) {
-		disk_bytenr = em->block_start;
+		disk_bytenr = em->disk_bytenr;
 		/*
 		 * Bail if the buffer isn't large enough to return the whole
 		 * compressed extent.
 		 */
-		if (em->block_len > count) {
+		if (em->disk_num_bytes > count) {
 			ret = -ENOBUFS;
 			goto out_em;
 		}
-		disk_io_size = em->block_len;
-		count = em->block_len;
+		disk_io_size = em->disk_num_bytes;
+		count = em->disk_num_bytes;
 		encoded->unencoded_len = em->ram_bytes;
-		encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
+		encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset);
 		ret = btrfs_encoded_io_compression_from_extent(fs_info,
 							       extent_map_compression(em));
 		if (ret < 0)
 			goto out_em;
 		encoded->compression = ret;
 	} else {
-		disk_bytenr = em->block_start + (start - em->start);
+		disk_bytenr = extent_map_block_start(em) + (start - em->start);
 		if (encoded->len > count)
 			encoded->len = count;
 		/*
@@ -10155,6 +9356,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
 	struct extent_changeset *data_reserved = NULL;
 	struct extent_state *cached_state = NULL;
 	struct btrfs_ordered_extent *ordered;
+	struct btrfs_file_extent file_extent;
 	int compression;
 	size_t orig_count;
 	u64 start, end;
@@ -10276,7 +9478,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
 	for (;;) {
 		struct btrfs_ordered_extent *ordered;
 
-		ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
+		ret = btrfs_wait_ordered_range(inode, start, num_bytes);
 		if (ret)
 			goto out_folios;
 		ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
@@ -10330,22 +9532,22 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
 		goto out_delalloc_release;
 	extent_reserved = true;
 
-	em = create_io_em(inode, start, num_bytes,
-			  start - encoded->unencoded_offset, ins.objectid,
-			  ins.offset, ins.offset, ram_bytes, compression,
-			  BTRFS_ORDERED_COMPRESSED);
+	file_extent.disk_bytenr = ins.objectid;
+	file_extent.disk_num_bytes = ins.offset;
+	file_extent.num_bytes = num_bytes;
+	file_extent.ram_bytes = ram_bytes;
+	file_extent.offset = encoded->unencoded_offset;
+	file_extent.compression = compression;
+	em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
 	if (IS_ERR(em)) {
 		ret = PTR_ERR(em);
 		goto out_free_reserved;
 	}
 	free_extent_map(em);
 
-	ordered = btrfs_alloc_ordered_extent(inode, start, num_bytes, ram_bytes,
-					     ins.objectid, ins.offset,
-					     encoded->unencoded_offset,
+	ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
 					     (1 << BTRFS_ORDERED_ENCODED) |
-					     (1 << BTRFS_ORDERED_COMPRESSED),
-					     compression);
+					     (1 << BTRFS_ORDERED_COMPRESSED));
 	if (IS_ERR(ordered)) {
 		btrfs_drop_extent_map_range(inode, start, end, false);
 		ret = PTR_ERR(ordered);
@@ -10549,7 +9751,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 	 * file changes again after this, the user is doing something stupid and
 	 * we don't really care.
 	 */
-	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+	ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
 	if (ret)
 		return ret;
 
@@ -10635,12 +9837,12 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 			goto out;
 		}
 
-		if (em->block_start == EXTENT_MAP_HOLE) {
+		if (em->disk_bytenr == EXTENT_MAP_HOLE) {
 			btrfs_warn(fs_info, "swapfile must not have holes");
 			ret = -EINVAL;
 			goto out;
 		}
-		if (em->block_start == EXTENT_MAP_INLINE) {
+		if (em->disk_bytenr == EXTENT_MAP_INLINE) {
 			/*
 			 * It's unlikely we'll ever actually find ourselves
 			 * here, as a file small enough to fit inline won't be
@@ -10658,12 +9860,12 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 			goto out;
 		}
 
-		logical_block_start = em->block_start + (start - em->start);
+		logical_block_start = extent_map_block_start(em) + (start - em->start);
 		len = min(len, em->len - (start - em->start));
 		free_extent_map(em);
 		em = NULL;
 
-		ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true);
+		ret = can_nocow_extent(inode, start, &len, NULL, false, true);
 		if (ret < 0) {
 			goto out;
 		} else if (ret) {
@@ -10860,52 +10062,23 @@ void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 en
  */
 struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino)
 {
-	struct rb_node *node;
-	struct rb_node *prev;
 	struct btrfs_inode *inode;
+	unsigned long from = min_ino;
 
-	spin_lock(&root->inode_lock);
-again:
-	node = root->inode_tree.rb_node;
-	prev = NULL;
-	while (node) {
-		prev = node;
-		inode = rb_entry(node, struct btrfs_inode, rb_node);
-		if (min_ino < btrfs_ino(inode))
-			node = node->rb_left;
-		else if (min_ino > btrfs_ino(inode))
-			node = node->rb_right;
-		else
+	xa_lock(&root->inodes);
+	while (true) {
+		inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
+		if (!inode)
+			break;
+		if (igrab(&inode->vfs_inode))
 			break;
-	}
-
-	if (!node) {
-		while (prev) {
-			inode = rb_entry(prev, struct btrfs_inode, rb_node);
-			if (min_ino <= btrfs_ino(inode)) {
-				node = prev;
-				break;
-			}
-			prev = rb_next(prev);
-		}
-	}
-
-	while (node) {
-		inode = rb_entry(prev, struct btrfs_inode, rb_node);
-		if (igrab(&inode->vfs_inode)) {
-			spin_unlock(&root->inode_lock);
-			return inode;
-		}
-
-		min_ino = btrfs_ino(inode) + 1;
-		if (cond_resched_lock(&root->inode_lock))
-			goto again;
-		node = rb_next(node);
+		from = btrfs_ino(inode) + 1;
+		cond_resched_lock(&root->inodes.xa_lock);
 	}
-	spin_unlock(&root->inode_lock);
+	xa_unlock(&root->inodes);
 
-	return NULL;
+	return inode;
 }
 
 static const struct inode_operations btrfs_dir_inode_operations = {
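Several hunks above replace the old extent_map geometry fields: em->block_start becomes either em->disk_bytenr (for compressed extents) or the extent_map_block_start() helper, em->block_len becomes em->disk_num_bytes, and expressions using em->orig_start are rewritten as em->start - em->offset (see the unencoded_offset line in the encoded-read hunk). As a hedged reading of those substitutions only, not the actual helper from extent_map.h, the old block_start value can be recovered from the new fields roughly like this:

/*
 * Hypothetical reconstruction, inferred purely from the substitutions
 * visible in this diff (builds against btrfs-internal headers that
 * define struct extent_map and extent_map_is_compressed()): for a
 * regular extent the logical disk start is the start of the on-disk
 * extent plus how far into it this mapping begins; a compressed extent
 * always refers to the whole on-disk extent.
 */
static u64 example_block_start(const struct extent_map *em)
{
	if (extent_map_is_compressed(em))
		return em->disk_bytenr;
	return em->disk_bytenr + em->offset;
}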
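The encoded-write hunk likewise shows btrfs_create_io_em() and btrfs_alloc_ordered_extent() now taking a single struct btrfs_file_extent descriptor in place of long positional argument lists, which removes a class of argument-ordering mistakes. The field-by-field assignments above are equivalent to a designated initializer (field meanings inferred from the call sites in this diff; the locals come from btrfs_do_encoded_write()):

struct btrfs_file_extent file_extent = {
	.disk_bytenr	= ins.objectid,			/* disk start of the allocation */
	.disk_num_bytes	= ins.offset,			/* on-disk (compressed) size */
	.num_bytes	= num_bytes,			/* file range covered */
	.ram_bytes	= ram_bytes,			/* uncompressed size */
	.offset		= encoded->unencoded_offset,	/* start within the extent */
	.compression	= compression,
};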
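Finally, btrfs_find_first_inode() above trades the rb-tree descent, with its "goto again" restart after a lock drop, for an xarray search: xa_find() resumes from a caller-held index, so after cond_resched_lock() possibly drops and retakes the lock the loop simply continues from 'from'. The same resumable-iteration pattern in isolation, a minimal sketch assuming only a populated xarray:

#include <linux/sched.h>
#include <linux/xarray.h>

static void example_walk(struct xarray *xa)
{
	unsigned long index = 0;
	void *entry;

	xa_lock(xa);
	while ((entry = xa_find(xa, &index, ULONG_MAX, XA_PRESENT))) {
		/* ... process entry under xa_lock ... */
		if (index == ULONG_MAX)
			break;
		index++;			/* resume after this entry */
		cond_resched_lock(&xa->xa_lock);	/* may drop and retake */
	}
	xa_unlock(xa);
}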