diff options
Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r-- | fs/btrfs/extent_io.c | 2908 |
1 files changed, 651 insertions, 2257 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cf4f19e80e2f..4dcf22e051ff 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -31,38 +31,27 @@ #include "block-group.h" #include "compression.h" -static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; -static struct bio_set btrfs_bioset; - -static inline bool extent_state_in_tree(const struct extent_state *state) -{ - return !RB_EMPTY_NODE(&state->rb_node); -} #ifdef CONFIG_BTRFS_DEBUG -static LIST_HEAD(states); -static DEFINE_SPINLOCK(leak_lock); - -static inline void btrfs_leak_debug_add(spinlock_t *lock, - struct list_head *new, - struct list_head *head) +static inline void btrfs_leak_debug_add_eb(struct extent_buffer *eb) { + struct btrfs_fs_info *fs_info = eb->fs_info; unsigned long flags; - spin_lock_irqsave(lock, flags); - list_add(new, head); - spin_unlock_irqrestore(lock, flags); + spin_lock_irqsave(&fs_info->eb_leak_lock, flags); + list_add(&eb->leak_list, &fs_info->allocated_ebs); + spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); } -static inline void btrfs_leak_debug_del(spinlock_t *lock, - struct list_head *entry) +static inline void btrfs_leak_debug_del_eb(struct extent_buffer *eb) { + struct btrfs_fs_info *fs_info = eb->fs_info; unsigned long flags; - spin_lock_irqsave(lock, flags); - list_del(entry); - spin_unlock_irqrestore(lock, flags); + spin_lock_irqsave(&fs_info->eb_leak_lock, flags); + list_del(&eb->leak_list); + spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); } void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) @@ -91,53 +80,11 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) } spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); } - -static inline void btrfs_extent_state_leak_debug_check(void) -{ - struct extent_state *state; - - while (!list_empty(&states)) { - state = list_entry(states.next, struct extent_state, leak_list); - pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", - state->start, state->end, state->state, - extent_state_in_tree(state), - refcount_read(&state->refs)); - list_del(&state->leak_list); - kmem_cache_free(extent_state_cache, state); - } -} - -#define btrfs_debug_check_extent_io_range(tree, start, end) \ - __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end)) -static inline void __btrfs_debug_check_extent_io_range(const char *caller, - struct extent_io_tree *tree, u64 start, u64 end) -{ - struct inode *inode = tree->private_data; - u64 isize; - - if (!inode || !is_data_inode(inode)) - return; - - isize = i_size_read(inode); - if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { - btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, - "%s: ino %llu isize %llu odd range [%llu,%llu]", - caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); - } -} #else -#define btrfs_leak_debug_add(lock, new, head) do {} while (0) -#define btrfs_leak_debug_del(lock, entry) do {} while (0) -#define btrfs_extent_state_leak_debug_check() do {} while (0) -#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) +#define btrfs_leak_debug_add_eb(eb) do {} while (0) +#define btrfs_leak_debug_del_eb(eb) do {} while (0) #endif -struct tree_entry { - u64 start; - u64 end; - struct rb_node rb_node; -}; - /* * Structure to record info about the bio being assembled, and other info like * how many bytes are there before stripe/ordered extent boundary. @@ -148,6 +95,7 @@ struct btrfs_bio_ctrl { enum btrfs_compression_type compress_type; u32 len_to_stripe_boundary; u32 len_to_oe_boundary; + btrfs_bio_end_io_t end_io_func; }; struct extent_page_data { @@ -161,24 +109,6 @@ struct extent_page_data { unsigned int sync_io:1; }; -static int add_extent_changeset(struct extent_state *state, u32 bits, - struct extent_changeset *changeset, - int set) -{ - int ret; - - if (!changeset) - return 0; - if (set && (state->state & bits) == bits) - return 0; - if (!set && (state->state & bits) == 0) - return 0; - changeset->bytes_changed += state->end - state->start + 1; - ret = ulist_add(&changeset->range_changed, state->start, state->end, - GFP_ATOMIC); - return ret; -} - static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { struct bio *bio; @@ -207,7 +137,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) btrfs_submit_data_read_bio(inode, bio, mirror_num, bio_ctrl->compress_type); - /* The bio is owned by the bi_end_io handler now */ + /* The bio is owned by the end_io handler now */ bio_ctrl->bio = NULL; } @@ -223,26 +153,15 @@ static void submit_write_bio(struct extent_page_data *epd, int ret) if (ret) { ASSERT(ret < 0); - bio->bi_status = errno_to_blk_status(ret); - bio_endio(bio); - /* The bio is owned by the bi_end_io handler now */ + btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); + /* The bio is owned by the end_io handler now */ epd->bio_ctrl.bio = NULL; } else { submit_one_bio(&epd->bio_ctrl); } } -int __init extent_state_cache_init(void) -{ - extent_state_cache = kmem_cache_create("btrfs_extent_state", - sizeof(struct extent_state), 0, - SLAB_MEM_SPREAD, NULL); - if (!extent_state_cache) - return -ENOMEM; - return 0; -} - -int __init extent_io_init(void) +int __init extent_buffer_init_cachep(void) { extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", sizeof(struct extent_buffer), 0, @@ -250,32 +169,10 @@ int __init extent_io_init(void) if (!extent_buffer_cache) return -ENOMEM; - if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, - offsetof(struct btrfs_bio, bio), - BIOSET_NEED_BVECS)) - goto free_buffer_cache; - - if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE)) - goto free_bioset; - return 0; - -free_bioset: - bioset_exit(&btrfs_bioset); - -free_buffer_cache: - kmem_cache_destroy(extent_buffer_cache); - extent_buffer_cache = NULL; - return -ENOMEM; -} - -void __cold extent_state_cache_exit(void) -{ - btrfs_extent_state_leak_debug_check(); - kmem_cache_destroy(extent_state_cache); } -void __cold extent_io_exit(void) +void __cold extent_buffer_free_cachep(void) { /* * Make sure all delayed rcu free are flushed before we @@ -283,1244 +180,6 @@ void __cold extent_io_exit(void) */ rcu_barrier(); kmem_cache_destroy(extent_buffer_cache); - bioset_exit(&btrfs_bioset); -} - -/* - * For the file_extent_tree, we want to hold the inode lock when we lookup and - * update the disk_i_size, but lockdep will complain because our io_tree we hold - * the tree lock and get the inode lock when setting delalloc. These two things - * are unrelated, so make a class for the file_extent_tree so we don't get the - * two locking patterns mixed up. - */ -static struct lock_class_key file_extent_tree_class; - -void extent_io_tree_init(struct btrfs_fs_info *fs_info, - struct extent_io_tree *tree, unsigned int owner, - void *private_data) -{ - tree->fs_info = fs_info; - tree->state = RB_ROOT; - tree->dirty_bytes = 0; - spin_lock_init(&tree->lock); - tree->private_data = private_data; - tree->owner = owner; - if (owner == IO_TREE_INODE_FILE_EXTENT) - lockdep_set_class(&tree->lock, &file_extent_tree_class); -} - -void extent_io_tree_release(struct extent_io_tree *tree) -{ - spin_lock(&tree->lock); - /* - * Do a single barrier for the waitqueue_active check here, the state - * of the waitqueue should not change once extent_io_tree_release is - * called. - */ - smp_mb(); - while (!RB_EMPTY_ROOT(&tree->state)) { - struct rb_node *node; - struct extent_state *state; - - node = rb_first(&tree->state); - state = rb_entry(node, struct extent_state, rb_node); - rb_erase(&state->rb_node, &tree->state); - RB_CLEAR_NODE(&state->rb_node); - /* - * btree io trees aren't supposed to have tasks waiting for - * changes in the flags of extent states ever. - */ - ASSERT(!waitqueue_active(&state->wq)); - free_extent_state(state); - - cond_resched_lock(&tree->lock); - } - spin_unlock(&tree->lock); -} - -static struct extent_state *alloc_extent_state(gfp_t mask) -{ - struct extent_state *state; - - /* - * The given mask might be not appropriate for the slab allocator, - * drop the unsupported bits - */ - mask &= ~(__GFP_DMA32|__GFP_HIGHMEM); - state = kmem_cache_alloc(extent_state_cache, mask); - if (!state) - return state; - state->state = 0; - state->failrec = NULL; - RB_CLEAR_NODE(&state->rb_node); - btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states); - refcount_set(&state->refs, 1); - init_waitqueue_head(&state->wq); - trace_alloc_extent_state(state, mask, _RET_IP_); - return state; -} - -void free_extent_state(struct extent_state *state) -{ - if (!state) - return; - if (refcount_dec_and_test(&state->refs)) { - WARN_ON(extent_state_in_tree(state)); - btrfs_leak_debug_del(&leak_lock, &state->leak_list); - trace_free_extent_state(state, _RET_IP_); - kmem_cache_free(extent_state_cache, state); - } -} - -/** - * Search @tree for an entry that contains @offset. Such entry would have - * entry->start <= offset && entry->end >= offset. - * - * @tree: the tree to search - * @offset: offset that should fall within an entry in @tree - * @node_ret: pointer where new node should be anchored (used when inserting an - * entry in the tree) - * @parent_ret: points to entry which would have been the parent of the entry, - * containing @offset - * - * Return a pointer to the entry that contains @offset byte address and don't change - * @node_ret and @parent_ret. - * - * If no such entry exists, return pointer to entry that ends before @offset - * and fill parameters @node_ret and @parent_ret, ie. does not return NULL. - */ -static inline struct rb_node *tree_search_for_insert(struct extent_io_tree *tree, - u64 offset, - struct rb_node ***node_ret, - struct rb_node **parent_ret) -{ - struct rb_root *root = &tree->state; - struct rb_node **node = &root->rb_node; - struct rb_node *prev = NULL; - struct tree_entry *entry; - - while (*node) { - prev = *node; - entry = rb_entry(prev, struct tree_entry, rb_node); - - if (offset < entry->start) - node = &(*node)->rb_left; - else if (offset > entry->end) - node = &(*node)->rb_right; - else - return *node; - } - - if (node_ret) - *node_ret = node; - if (parent_ret) - *parent_ret = prev; - - /* Search neighbors until we find the first one past the end */ - while (prev && offset > entry->end) { - prev = rb_next(prev); - entry = rb_entry(prev, struct tree_entry, rb_node); - } - - return prev; -} - -/* - * Inexact rb-tree search, return the next entry if @offset is not found - */ -static inline struct rb_node *tree_search(struct extent_io_tree *tree, u64 offset) -{ - return tree_search_for_insert(tree, offset, NULL, NULL); -} - -/** - * Search offset in the tree or fill neighbor rbtree node pointers. - * - * @tree: the tree to search - * @offset: offset that should fall within an entry in @tree - * @next_ret: pointer to the first entry whose range ends after @offset - * @prev_ret: pointer to the first entry whose range begins before @offset - * - * Return a pointer to the entry that contains @offset byte address. If no - * such entry exists, then return NULL and fill @prev_ret and @next_ret. - * Otherwise return the found entry and other pointers are left untouched. - */ -static struct rb_node *tree_search_prev_next(struct extent_io_tree *tree, - u64 offset, - struct rb_node **prev_ret, - struct rb_node **next_ret) -{ - struct rb_root *root = &tree->state; - struct rb_node **node = &root->rb_node; - struct rb_node *prev = NULL; - struct rb_node *orig_prev = NULL; - struct tree_entry *entry; - - ASSERT(prev_ret); - ASSERT(next_ret); - - while (*node) { - prev = *node; - entry = rb_entry(prev, struct tree_entry, rb_node); - - if (offset < entry->start) - node = &(*node)->rb_left; - else if (offset > entry->end) - node = &(*node)->rb_right; - else - return *node; - } - - orig_prev = prev; - while (prev && offset > entry->end) { - prev = rb_next(prev); - entry = rb_entry(prev, struct tree_entry, rb_node); - } - *next_ret = prev; - prev = orig_prev; - - entry = rb_entry(prev, struct tree_entry, rb_node); - while (prev && offset < entry->start) { - prev = rb_prev(prev); - entry = rb_entry(prev, struct tree_entry, rb_node); - } - *prev_ret = prev; - - return NULL; -} - -/* - * utility function to look for merge candidates inside a given range. - * Any extents with matching state are merged together into a single - * extent in the tree. Extents with EXTENT_IO in their state field - * are not merged because the end_io handlers need to be able to do - * operations on them without sleeping (or doing allocations/splits). - * - * This should be called with the tree lock held. - */ -static void merge_state(struct extent_io_tree *tree, - struct extent_state *state) -{ - struct extent_state *other; - struct rb_node *other_node; - - if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY)) - return; - - other_node = rb_prev(&state->rb_node); - if (other_node) { - other = rb_entry(other_node, struct extent_state, rb_node); - if (other->end == state->start - 1 && - other->state == state->state) { - if (tree->private_data && - is_data_inode(tree->private_data)) - btrfs_merge_delalloc_extent(tree->private_data, - state, other); - state->start = other->start; - rb_erase(&other->rb_node, &tree->state); - RB_CLEAR_NODE(&other->rb_node); - free_extent_state(other); - } - } - other_node = rb_next(&state->rb_node); - if (other_node) { - other = rb_entry(other_node, struct extent_state, rb_node); - if (other->start == state->end + 1 && - other->state == state->state) { - if (tree->private_data && - is_data_inode(tree->private_data)) - btrfs_merge_delalloc_extent(tree->private_data, - state, other); - state->end = other->end; - rb_erase(&other->rb_node, &tree->state); - RB_CLEAR_NODE(&other->rb_node); - free_extent_state(other); - } - } -} - -static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, u32 bits, - struct extent_changeset *changeset); - -/* - * insert an extent_state struct into the tree. 'bits' are set on the - * struct before it is inserted. - * - * This may return -EEXIST if the extent is already there, in which case the - * state struct is freed. - * - * The tree lock is not taken internally. This is a utility function and - * probably isn't what you want to call (see set/clear_extent_bit). - */ -static int insert_state(struct extent_io_tree *tree, - struct extent_state *state, - u32 bits, struct extent_changeset *changeset) -{ - struct rb_node **node; - struct rb_node *parent; - const u64 end = state->end; - - set_state_bits(tree, state, bits, changeset); - - node = &tree->state.rb_node; - while (*node) { - struct tree_entry *entry; - - parent = *node; - entry = rb_entry(parent, struct tree_entry, rb_node); - - if (end < entry->start) { - node = &(*node)->rb_left; - } else if (end > entry->end) { - node = &(*node)->rb_right; - } else { - btrfs_err(tree->fs_info, - "found node %llu %llu on insert of %llu %llu", - entry->start, entry->end, state->start, end); - return -EEXIST; - } - } - - rb_link_node(&state->rb_node, parent, node); - rb_insert_color(&state->rb_node, &tree->state); - - merge_state(tree, state); - return 0; -} - -/* - * Insert state to @tree to the location given by @node and @parent. - */ -static void insert_state_fast(struct extent_io_tree *tree, - struct extent_state *state, struct rb_node **node, - struct rb_node *parent, unsigned bits, - struct extent_changeset *changeset) -{ - set_state_bits(tree, state, bits, changeset); - rb_link_node(&state->rb_node, parent, node); - rb_insert_color(&state->rb_node, &tree->state); - merge_state(tree, state); -} - -/* - * split a given extent state struct in two, inserting the preallocated - * struct 'prealloc' as the newly created second half. 'split' indicates an - * offset inside 'orig' where it should be split. - * - * Before calling, - * the tree has 'orig' at [orig->start, orig->end]. After calling, there - * are two extent state structs in the tree: - * prealloc: [orig->start, split - 1] - * orig: [ split, orig->end ] - * - * The tree locks are not taken by this function. They need to be held - * by the caller. - */ -static int split_state(struct extent_io_tree *tree, struct extent_state *orig, - struct extent_state *prealloc, u64 split) -{ - struct rb_node *parent = NULL; - struct rb_node **node; - - if (tree->private_data && is_data_inode(tree->private_data)) - btrfs_split_delalloc_extent(tree->private_data, orig, split); - - prealloc->start = orig->start; - prealloc->end = split - 1; - prealloc->state = orig->state; - orig->start = split; - - parent = &orig->rb_node; - node = &parent; - while (*node) { - struct tree_entry *entry; - - parent = *node; - entry = rb_entry(parent, struct tree_entry, rb_node); - - if (prealloc->end < entry->start) { - node = &(*node)->rb_left; - } else if (prealloc->end > entry->end) { - node = &(*node)->rb_right; - } else { - free_extent_state(prealloc); - return -EEXIST; - } - } - - rb_link_node(&prealloc->rb_node, parent, node); - rb_insert_color(&prealloc->rb_node, &tree->state); - - return 0; -} - -static struct extent_state *next_state(struct extent_state *state) -{ - struct rb_node *next = rb_next(&state->rb_node); - if (next) - return rb_entry(next, struct extent_state, rb_node); - else - return NULL; -} - -/* - * utility function to clear some bits in an extent state struct. - * it will optionally wake up anyone waiting on this state (wake == 1). - * - * If no bits are set on the state struct after clearing things, the - * struct is freed and removed from the tree - */ -static struct extent_state *clear_state_bit(struct extent_io_tree *tree, - struct extent_state *state, - u32 bits, int wake, - struct extent_changeset *changeset) -{ - struct extent_state *next; - u32 bits_to_clear = bits & ~EXTENT_CTLBITS; - int ret; - - if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { - u64 range = state->end - state->start + 1; - WARN_ON(range > tree->dirty_bytes); - tree->dirty_bytes -= range; - } - - if (tree->private_data && is_data_inode(tree->private_data)) - btrfs_clear_delalloc_extent(tree->private_data, state, bits); - - ret = add_extent_changeset(state, bits_to_clear, changeset, 0); - BUG_ON(ret < 0); - state->state &= ~bits_to_clear; - if (wake) - wake_up(&state->wq); - if (state->state == 0) { - next = next_state(state); - if (extent_state_in_tree(state)) { - rb_erase(&state->rb_node, &tree->state); - RB_CLEAR_NODE(&state->rb_node); - free_extent_state(state); - } else { - WARN_ON(1); - } - } else { - merge_state(tree, state); - next = next_state(state); - } - return next; -} - -static struct extent_state * -alloc_extent_state_atomic(struct extent_state *prealloc) -{ - if (!prealloc) - prealloc = alloc_extent_state(GFP_ATOMIC); - - return prealloc; -} - -static void extent_io_tree_panic(struct extent_io_tree *tree, int err) -{ - btrfs_panic(tree->fs_info, err, - "locking error: extent tree was modified by another thread while locked"); -} - -/* - * clear some bits on a range in the tree. This may require splitting - * or inserting elements in the tree, so the gfp mask is used to - * indicate which allocations or sleeping are allowed. - * - * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove - * the given range from the tree regardless of state (ie for truncate). - * - * the range [start, end] is inclusive. - * - * This takes the tree lock, and returns 0 on success and < 0 on error. - */ -int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int wake, int delete, - struct extent_state **cached_state, - gfp_t mask, struct extent_changeset *changeset) -{ - struct extent_state *state; - struct extent_state *cached; - struct extent_state *prealloc = NULL; - struct rb_node *node; - u64 last_end; - int err; - int clear = 0; - - btrfs_debug_check_extent_io_range(tree, start, end); - trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits); - - if (bits & EXTENT_DELALLOC) - bits |= EXTENT_NORESERVE; - - if (delete) - bits |= ~EXTENT_CTLBITS; - - if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)) - clear = 1; -again: - if (!prealloc && gfpflags_allow_blocking(mask)) { - /* - * Don't care for allocation failure here because we might end - * up not needing the pre-allocated extent state at all, which - * is the case if we only have in the tree extent states that - * cover our input range and don't cover too any other range. - * If we end up needing a new extent state we allocate it later. - */ - prealloc = alloc_extent_state(mask); - } - - spin_lock(&tree->lock); - if (cached_state) { - cached = *cached_state; - - if (clear) { - *cached_state = NULL; - cached_state = NULL; - } - - if (cached && extent_state_in_tree(cached) && - cached->start <= start && cached->end > start) { - if (clear) - refcount_dec(&cached->refs); - state = cached; - goto hit_next; - } - if (clear) - free_extent_state(cached); - } - /* - * this search will find the extents that end after - * our range starts - */ - node = tree_search(tree, start); - if (!node) - goto out; - state = rb_entry(node, struct extent_state, rb_node); -hit_next: - if (state->start > end) - goto out; - WARN_ON(state->end < start); - last_end = state->end; - - /* the state doesn't have the wanted bits, go ahead */ - if (!(state->state & bits)) { - state = next_state(state); - goto next; - } - - /* - * | ---- desired range ---- | - * | state | or - * | ------------- state -------------- | - * - * We need to split the extent we found, and may flip - * bits on second half. - * - * If the extent we found extends past our range, we - * just split and search again. It'll get split again - * the next time though. - * - * If the extent we found is inside our range, we clear - * the desired bit on it. - */ - - if (state->start < start) { - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, err); - - prealloc = NULL; - if (err) - goto out; - if (state->end <= end) { - state = clear_state_bit(tree, state, bits, wake, changeset); - goto next; - } - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | - * We need to split the extent, and clear the bit - * on the first half - */ - if (state->start <= end && state->end > end) { - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, err); - - if (wake) - wake_up(&state->wq); - - clear_state_bit(tree, prealloc, bits, wake, changeset); - - prealloc = NULL; - goto out; - } - - state = clear_state_bit(tree, state, bits, wake, changeset); -next: - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - if (start <= end && state && !need_resched()) - goto hit_next; - -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - if (gfpflags_allow_blocking(mask)) - cond_resched(); - goto again; - -out: - spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); - - return 0; - -} - -static void wait_on_state(struct extent_io_tree *tree, - struct extent_state *state) - __releases(tree->lock) - __acquires(tree->lock) -{ - DEFINE_WAIT(wait); - prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&tree->lock); - schedule(); - spin_lock(&tree->lock); - finish_wait(&state->wq, &wait); -} - -/* - * waits for one or more bits to clear on a range in the state tree. - * The range [start, end] is inclusive. - * The tree lock is taken by this function - */ -static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits) -{ - struct extent_state *state; - struct rb_node *node; - - btrfs_debug_check_extent_io_range(tree, start, end); - - spin_lock(&tree->lock); -again: - while (1) { - /* - * this search will find all the extents that end after - * our range starts - */ - node = tree_search(tree, start); -process_node: - if (!node) - break; - - state = rb_entry(node, struct extent_state, rb_node); - - if (state->start > end) - goto out; - - if (state->state & bits) { - start = state->start; - refcount_inc(&state->refs); - wait_on_state(tree, state); - free_extent_state(state); - goto again; - } - start = state->end + 1; - - if (start > end) - break; - - if (!cond_resched_lock(&tree->lock)) { - node = rb_next(node); - goto process_node; - } - } -out: - spin_unlock(&tree->lock); -} - -static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, - u32 bits, struct extent_changeset *changeset) -{ - u32 bits_to_set = bits & ~EXTENT_CTLBITS; - int ret; - - if (tree->private_data && is_data_inode(tree->private_data)) - btrfs_set_delalloc_extent(tree->private_data, state, bits); - - if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { - u64 range = state->end - state->start + 1; - tree->dirty_bytes += range; - } - ret = add_extent_changeset(state, bits_to_set, changeset, 1); - BUG_ON(ret < 0); - state->state |= bits_to_set; -} - -static void cache_state_if_flags(struct extent_state *state, - struct extent_state **cached_ptr, - unsigned flags) -{ - if (cached_ptr && !(*cached_ptr)) { - if (!flags || (state->state & flags)) { - *cached_ptr = state; - refcount_inc(&state->refs); - } - } -} - -static void cache_state(struct extent_state *state, - struct extent_state **cached_ptr) -{ - return cache_state_if_flags(state, cached_ptr, - EXTENT_LOCKED | EXTENT_BOUNDARY); -} - -/* - * set some bits on a range in the tree. This may require allocations or - * sleeping, so the gfp mask is used to indicate what is allowed. - * - * If any of the exclusive bits are set, this will fail with -EEXIST if some - * part of the range already has the desired bits set. The start of the - * existing range is returned in failed_start in this case. - * - * [start, end] is inclusive This takes the tree lock. - */ -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - u32 exclusive_bits, u64 *failed_start, - struct extent_state **cached_state, gfp_t mask, - struct extent_changeset *changeset) -{ - struct extent_state *state; - struct extent_state *prealloc = NULL; - struct rb_node *node; - struct rb_node **p; - struct rb_node *parent; - int err = 0; - u64 last_start; - u64 last_end; - - btrfs_debug_check_extent_io_range(tree, start, end); - trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits); - - if (exclusive_bits) - ASSERT(failed_start); - else - ASSERT(failed_start == NULL); -again: - if (!prealloc && gfpflags_allow_blocking(mask)) { - /* - * Don't care for allocation failure here because we might end - * up not needing the pre-allocated extent state at all, which - * is the case if we only have in the tree extent states that - * cover our input range and don't cover too any other range. - * If we end up needing a new extent state we allocate it later. - */ - prealloc = alloc_extent_state(mask); - } - - spin_lock(&tree->lock); - if (cached_state && *cached_state) { - state = *cached_state; - if (state->start <= start && state->end > start && - extent_state_in_tree(state)) { - node = &state->rb_node; - goto hit_next; - } - } - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search_for_insert(tree, start, &p, &parent); - if (!node) { - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - prealloc->start = start; - prealloc->end = end; - insert_state_fast(tree, prealloc, p, parent, bits, changeset); - cache_state(prealloc, cached_state); - prealloc = NULL; - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); -hit_next: - last_start = state->start; - last_end = state->end; - - /* - * | ---- desired range ---- | - * | state | - * - * Just lock what we found and keep going - */ - if (state->start == start && state->end <= end) { - if (state->state & exclusive_bits) { - *failed_start = state->start; - err = -EEXIST; - goto out; - } - - set_state_bits(tree, state, bits, changeset); - cache_state(state, cached_state); - merge_state(tree, state); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - state = next_state(state); - if (start < end && state && state->start == start && - !need_resched()) - goto hit_next; - goto search_again; - } - - /* - * | ---- desired range ---- | - * | state | - * or - * | ------------- state -------------- | - * - * We need to split the extent we found, and may flip bits on - * second half. - * - * If the extent we found extends past our - * range, we just split and search again. It'll get split - * again the next time though. - * - * If the extent we found is inside our range, we set the - * desired bit on it. - */ - if (state->start < start) { - if (state->state & exclusive_bits) { - *failed_start = start; - err = -EEXIST; - goto out; - } - - /* - * If this extent already has all the bits we want set, then - * skip it, not necessary to split it or do anything with it. - */ - if ((state->state & bits) == bits) { - start = state->end + 1; - cache_state(state, cached_state); - goto search_again; - } - - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, err); - - prealloc = NULL; - if (err) - goto out; - if (state->end <= end) { - set_state_bits(tree, state, bits, changeset); - cache_state(state, cached_state); - merge_state(tree, state); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - state = next_state(state); - if (start < end && state && state->start == start && - !need_resched()) - goto hit_next; - } - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | or | state | - * - * There's a hole, we need to insert something in it and - * ignore the extent we found. - */ - if (state->start > start) { - u64 this_end; - if (end < last_start) - this_end = end; - else - this_end = last_start - 1; - - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - - /* - * Avoid to free 'prealloc' if it can be merged with - * the later extent. - */ - prealloc->start = start; - prealloc->end = this_end; - err = insert_state(tree, prealloc, bits, changeset); - if (err) - extent_io_tree_panic(tree, err); - - cache_state(prealloc, cached_state); - prealloc = NULL; - start = this_end + 1; - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | - * We need to split the extent, and set the bit - * on the first half - */ - if (state->start <= end && state->end > end) { - if (state->state & exclusive_bits) { - *failed_start = start; - err = -EEXIST; - goto out; - } - - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, err); - - set_state_bits(tree, prealloc, bits, changeset); - cache_state(prealloc, cached_state); - merge_state(tree, prealloc); - prealloc = NULL; - goto out; - } - -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - if (gfpflags_allow_blocking(mask)) - cond_resched(); - goto again; - -out: - spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); - - return err; - -} - -/** - * convert_extent_bit - convert all bits in a given range from one bit to - * another - * @tree: the io tree to search - * @start: the start offset in bytes - * @end: the end offset in bytes (inclusive) - * @bits: the bits to set in this range - * @clear_bits: the bits to clear in this range - * @cached_state: state that we're going to cache - * - * This will go through and set bits for the given range. If any states exist - * already in this range they are set with the given bit and cleared of the - * clear_bits. This is only meant to be used by things that are mergeable, ie - * converting from say DELALLOC to DIRTY. This is not meant to be used with - * boundary bits like LOCK. - * - * All allocations are done with GFP_NOFS. - */ -int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, u32 clear_bits, - struct extent_state **cached_state) -{ - struct extent_state *state; - struct extent_state *prealloc = NULL; - struct rb_node *node; - struct rb_node **p; - struct rb_node *parent; - int err = 0; - u64 last_start; - u64 last_end; - bool first_iteration = true; - - btrfs_debug_check_extent_io_range(tree, start, end); - trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits, - clear_bits); - -again: - if (!prealloc) { - /* - * Best effort, don't worry if extent state allocation fails - * here for the first iteration. We might have a cached state - * that matches exactly the target range, in which case no - * extent state allocations are needed. We'll only know this - * after locking the tree. - */ - prealloc = alloc_extent_state(GFP_NOFS); - if (!prealloc && !first_iteration) - return -ENOMEM; - } - - spin_lock(&tree->lock); - if (cached_state && *cached_state) { - state = *cached_state; - if (state->start <= start && state->end > start && - extent_state_in_tree(state)) { - node = &state->rb_node; - goto hit_next; - } - } - - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search_for_insert(tree, start, &p, &parent); - if (!node) { - prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) { - err = -ENOMEM; - goto out; - } - prealloc->start = start; - prealloc->end = end; - insert_state_fast(tree, prealloc, p, parent, bits, NULL); - cache_state(prealloc, cached_state); - prealloc = NULL; - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); -hit_next: - last_start = state->start; - last_end = state->end; - - /* - * | ---- desired range ---- | - * | state | - * - * Just lock what we found and keep going - */ - if (state->start == start && state->end <= end) { - set_state_bits(tree, state, bits, NULL); - cache_state(state, cached_state); - state = clear_state_bit(tree, state, clear_bits, 0, NULL); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - if (start < end && state && state->start == start && - !need_resched()) - goto hit_next; - goto search_again; - } - - /* - * | ---- desired range ---- | - * | state | - * or - * | ------------- state -------------- | - * - * We need to split the extent we found, and may flip bits on - * second half. - * - * If the extent we found extends past our - * range, we just split and search again. It'll get split - * again the next time though. - * - * If the extent we found is inside our range, we set the - * desired bit on it. - */ - if (state->start < start) { - prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) { - err = -ENOMEM; - goto out; - } - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, err); - prealloc = NULL; - if (err) - goto out; - if (state->end <= end) { - set_state_bits(tree, state, bits, NULL); - cache_state(state, cached_state); - state = clear_state_bit(tree, state, clear_bits, 0, NULL); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - if (start < end && state && state->start == start && - !need_resched()) - goto hit_next; - } - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | or | state | - * - * There's a hole, we need to insert something in it and - * ignore the extent we found. - */ - if (state->start > start) { - u64 this_end; - if (end < last_start) - this_end = end; - else - this_end = last_start - 1; - - prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) { - err = -ENOMEM; - goto out; - } - - /* - * Avoid to free 'prealloc' if it can be merged with - * the later extent. - */ - prealloc->start = start; - prealloc->end = this_end; - err = insert_state(tree, prealloc, bits, NULL); - if (err) - extent_io_tree_panic(tree, err); - cache_state(prealloc, cached_state); - prealloc = NULL; - start = this_end + 1; - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | - * We need to split the extent, and set the bit - * on the first half - */ - if (state->start <= end && state->end > end) { - prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) { - err = -ENOMEM; - goto out; - } - - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, err); - - set_state_bits(tree, prealloc, bits, NULL); - cache_state(prealloc, cached_state); - clear_state_bit(tree, prealloc, clear_bits, 0, NULL); - prealloc = NULL; - goto out; - } - -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - cond_resched(); - first_iteration = false; - goto again; - -out: - spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); - - return err; -} - -/* wrappers around set/clear extent bit */ -int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_changeset *changeset) -{ - /* - * We don't support EXTENT_LOCKED yet, as current changeset will - * record any bits changed, so for EXTENT_LOCKED case, it will - * either fail with -EEXIST or changeset will record the whole - * range. - */ - BUG_ON(bits & EXTENT_LOCKED); - - return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, - changeset); -} - -int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits) -{ - return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, - GFP_NOWAIT, NULL); -} - -int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int wake, int delete, - struct extent_state **cached) -{ - return __clear_extent_bit(tree, start, end, bits, wake, delete, - cached, GFP_NOFS, NULL); -} - -int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_changeset *changeset) -{ - /* - * Don't support EXTENT_LOCKED case, same reason as - * set_record_extent_bits(). - */ - BUG_ON(bits & EXTENT_LOCKED); - - return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS, - changeset); -} - -/* - * either insert or lock state struct between start and end use mask to tell - * us if waiting is desired. - */ -int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state) -{ - int err; - u64 failed_start; - - while (1) { - err = set_extent_bit(tree, start, end, EXTENT_LOCKED, - EXTENT_LOCKED, &failed_start, - cached_state, GFP_NOFS, NULL); - if (err == -EEXIST) { - wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); - start = failed_start; - } else - break; - WARN_ON(start > end); - } - return err; -} - -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) -{ - int err; - u64 failed_start; - - err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, - &failed_start, NULL, GFP_NOFS, NULL); - if (err == -EEXIST) { - if (failed_start > start) - clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, 1, 0, NULL); - return 0; - } - return 1; } void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) @@ -1554,295 +213,6 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) } } -/* find the first state struct with 'bits' set after 'start', and - * return it. tree->lock must be held. NULL will returned if - * nothing was found after 'start' - */ -static struct extent_state * -find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits) -{ - struct rb_node *node; - struct extent_state *state; - - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, start); - if (!node) - goto out; - - while (1) { - state = rb_entry(node, struct extent_state, rb_node); - if (state->end >= start && (state->state & bits)) - return state; - - node = rb_next(node); - if (!node) - break; - } -out: - return NULL; -} - -/* - * Find the first offset in the io tree with one or more @bits set. - * - * Note: If there are multiple bits set in @bits, any of them will match. - * - * Return 0 if we find something, and update @start_ret and @end_ret. - * Return 1 if we found nothing. - */ -int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits, - struct extent_state **cached_state) -{ - struct extent_state *state; - int ret = 1; - - spin_lock(&tree->lock); - if (cached_state && *cached_state) { - state = *cached_state; - if (state->end == start - 1 && extent_state_in_tree(state)) { - while ((state = next_state(state)) != NULL) { - if (state->state & bits) - goto got_it; - } - free_extent_state(*cached_state); - *cached_state = NULL; - goto out; - } - free_extent_state(*cached_state); - *cached_state = NULL; - } - - state = find_first_extent_bit_state(tree, start, bits); -got_it: - if (state) { - cache_state_if_flags(state, cached_state, 0); - *start_ret = state->start; - *end_ret = state->end; - ret = 0; - } -out: - spin_unlock(&tree->lock); - return ret; -} - -/** - * Find a contiguous area of bits - * - * @tree: io tree to check - * @start: offset to start the search from - * @start_ret: the first offset we found with the bits set - * @end_ret: the final contiguous range of the bits that were set - * @bits: bits to look for - * - * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges - * to set bits appropriately, and then merge them again. During this time it - * will drop the tree->lock, so use this helper if you want to find the actual - * contiguous area for given bits. We will search to the first bit we find, and - * then walk down the tree until we find a non-contiguous area. The area - * returned will be the full contiguous area with the bits set. - */ -int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits) -{ - struct extent_state *state; - int ret = 1; - - spin_lock(&tree->lock); - state = find_first_extent_bit_state(tree, start, bits); - if (state) { - *start_ret = state->start; - *end_ret = state->end; - while ((state = next_state(state)) != NULL) { - if (state->start > (*end_ret + 1)) - break; - *end_ret = state->end; - } - ret = 0; - } - spin_unlock(&tree->lock); - return ret; -} - -/** - * Find the first range that has @bits not set. This range could start before - * @start. - * - * @tree: the tree to search - * @start: offset at/after which the found extent should start - * @start_ret: records the beginning of the range - * @end_ret: records the end of the range (inclusive) - * @bits: the set of bits which must be unset - * - * Since unallocated range is also considered one which doesn't have the bits - * set it's possible that @end_ret contains -1, this happens in case the range - * spans (last_range_end, end of device]. In this case it's up to the caller to - * trim @end_ret to the appropriate size. - */ -void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits) -{ - struct extent_state *state; - struct rb_node *node, *prev = NULL, *next; - - spin_lock(&tree->lock); - - /* Find first extent with bits cleared */ - while (1) { - node = tree_search_prev_next(tree, start, &prev, &next); - if (!node && !next && !prev) { - /* - * Tree is completely empty, send full range and let - * caller deal with it - */ - *start_ret = 0; - *end_ret = -1; - goto out; - } else if (!node && !next) { - /* - * We are past the last allocated chunk, set start at - * the end of the last extent. - */ - state = rb_entry(prev, struct extent_state, rb_node); - *start_ret = state->end + 1; - *end_ret = -1; - goto out; - } else if (!node) { - node = next; - } - /* - * At this point 'node' either contains 'start' or start is - * before 'node' - */ - state = rb_entry(node, struct extent_state, rb_node); - - if (in_range(start, state->start, state->end - state->start + 1)) { - if (state->state & bits) { - /* - * |--range with bits sets--| - * | - * start - */ - start = state->end + 1; - } else { - /* - * 'start' falls within a range that doesn't - * have the bits set, so take its start as - * the beginning of the desired range - * - * |--range with bits cleared----| - * | - * start - */ - *start_ret = state->start; - break; - } - } else { - /* - * |---prev range---|---hole/unset---|---node range---| - * | - * start - * - * or - * - * |---hole/unset--||--first node--| - * 0 | - * start - */ - if (prev) { - state = rb_entry(prev, struct extent_state, - rb_node); - *start_ret = state->end + 1; - } else { - *start_ret = 0; - } - break; - } - } - - /* - * Find the longest stretch from start until an entry which has the - * bits set - */ - while (1) { - state = rb_entry(node, struct extent_state, rb_node); - if (state->end >= start && !(state->state & bits)) { - *end_ret = state->end; - } else { - *end_ret = state->start - 1; - break; - } - - node = rb_next(node); - if (!node) - break; - } -out: - spin_unlock(&tree->lock); -} - -/* - * find a contiguous range of bytes in the file marked as delalloc, not - * more than 'max_bytes'. start and end are used to return the range, - * - * true is returned if we find something, false if nothing was in the tree - */ -bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, - u64 *end, u64 max_bytes, - struct extent_state **cached_state) -{ - struct rb_node *node; - struct extent_state *state; - u64 cur_start = *start; - bool found = false; - u64 total_bytes = 0; - - spin_lock(&tree->lock); - - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, cur_start); - if (!node) { - *end = (u64)-1; - goto out; - } - - while (1) { - state = rb_entry(node, struct extent_state, rb_node); - if (found && (state->start != cur_start || - (state->state & EXTENT_BOUNDARY))) { - goto out; - } - if (!(state->state & EXTENT_DELALLOC)) { - if (!found) - *end = state->end; - goto out; - } - if (!found) { - *start = state->start; - *cached_state = state; - refcount_inc(&state->refs); - } - found = true; - *end = state->end; - cur_start = state->end + 1; - node = rb_next(node); - total_bytes += state->end - state->start + 1; - if (total_bytes >= max_bytes) - break; - if (!node) - break; - } -out: - spin_unlock(&tree->lock); - return found; -} - /* * Process one page for __process_pages_contig(). * @@ -1900,9 +270,8 @@ static int __process_pages_contig(struct address_space *mapping, pgoff_t start_index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; pgoff_t index = start_index; - unsigned long nr_pages = end_index - start_index + 1; unsigned long pages_processed = 0; - struct page *pages[16]; + struct folio_batch fbatch; int err = 0; int i; @@ -1911,16 +280,17 @@ static int __process_pages_contig(struct address_space *mapping, ASSERT(processed_end && *processed_end == start); } - if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) + if ((page_ops & PAGE_SET_ERROR) && start_index <= end_index) mapping_set_error(mapping, -EIO); - while (nr_pages > 0) { - int found_pages; + folio_batch_init(&fbatch); + while (index <= end_index) { + int found_folios; + + found_folios = filemap_get_folios_contig(mapping, &index, + end_index, &fbatch); - found_pages = find_get_pages_contig(mapping, index, - min_t(unsigned long, - nr_pages, ARRAY_SIZE(pages)), pages); - if (found_pages == 0) { + if (found_folios == 0) { /* * Only if we're going to lock these pages, we can find * nothing at @index. @@ -1930,23 +300,20 @@ static int __process_pages_contig(struct address_space *mapping, goto out; } - for (i = 0; i < found_pages; i++) { + for (i = 0; i < found_folios; i++) { int process_ret; - + struct folio *folio = fbatch.folios[i]; process_ret = process_one_page(fs_info, mapping, - pages[i], locked_page, page_ops, + &folio->page, locked_page, page_ops, start, end); if (process_ret < 0) { - for (; i < found_pages; i++) - put_page(pages[i]); err = -EAGAIN; + folio_batch_release(&fbatch); goto out; } - put_page(pages[i]); - pages_processed++; + pages_processed += folio_nr_pages(folio); } - nr_pages -= found_pages; - index += found_pages; + folio_batch_release(&fbatch); cond_resched(); } out: @@ -2094,14 +461,14 @@ again: } /* step three, lock the state bits for the whole range */ - lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state); + lock_extent(tree, delalloc_start, delalloc_end, &cached_state); /* then test to make sure it is all still delalloc */ ret = test_range_bit(tree, delalloc_start, delalloc_end, EXTENT_DELALLOC, 1, cached_state); if (!ret) { - unlock_extent_cached(tree, delalloc_start, delalloc_end, - &cached_state); + unlock_extent(tree, delalloc_start, delalloc_end, + &cached_state); __unlock_for_delalloc(inode, locked_page, delalloc_start, delalloc_end); cond_resched(); @@ -2118,210 +485,46 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, u32 clear_bits, unsigned long page_ops) { - clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL); + clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL); __process_pages_contig(inode->vfs_inode.i_mapping, locked_page, start, end, page_ops, NULL); } -/* - * count the number of bytes in the tree that have a given bit(s) - * set. This can be fairly slow, except for EXTENT_DIRTY which is - * cached. The total number found is returned. - */ -u64 count_range_bits(struct extent_io_tree *tree, - u64 *start, u64 search_end, u64 max_bytes, - u32 bits, int contig) +static int insert_failrec(struct btrfs_inode *inode, + struct io_failure_record *failrec) { - struct rb_node *node; - struct extent_state *state; - u64 cur_start = *start; - u64 total_bytes = 0; - u64 last = 0; - int found = 0; - - if (WARN_ON(search_end <= cur_start)) - return 0; + struct rb_node *exist; - spin_lock(&tree->lock); - if (cur_start == 0 && bits == EXTENT_DIRTY) { - total_bytes = tree->dirty_bytes; - goto out; - } - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, cur_start); - if (!node) - goto out; + spin_lock(&inode->io_failure_lock); + exist = rb_simple_insert(&inode->io_failure_tree, failrec->bytenr, + &failrec->rb_node); + spin_unlock(&inode->io_failure_lock); - while (1) { - state = rb_entry(node, struct extent_state, rb_node); - if (state->start > search_end) - break; - if (contig && found && state->start > last + 1) - break; - if (state->end >= cur_start && (state->state & bits) == bits) { - total_bytes += min(search_end, state->end) + 1 - - max(cur_start, state->start); - if (total_bytes >= max_bytes) - break; - if (!found) { - *start = max(cur_start, state->start); - found = 1; - } - last = state->end; - } else if (contig && found) { - break; - } - node = rb_next(node); - if (!node) - break; - } -out: - spin_unlock(&tree->lock); - return total_bytes; + return (exist == NULL) ? 0 : -EEXIST; } -/* - * set the private field for a given byte offset in the tree. If there isn't - * an extent_state there already, this does nothing. - */ -int set_state_failrec(struct extent_io_tree *tree, u64 start, - struct io_failure_record *failrec) +static struct io_failure_record *get_failrec(struct btrfs_inode *inode, u64 start) { struct rb_node *node; - struct extent_state *state; - int ret = 0; + struct io_failure_record *failrec = ERR_PTR(-ENOENT); - spin_lock(&tree->lock); - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, start); - if (!node) { - ret = -ENOENT; - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); - if (state->start != start) { - ret = -ENOENT; - goto out; - } - state->failrec = failrec; -out: - spin_unlock(&tree->lock); - return ret; -} - -struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start) -{ - struct rb_node *node; - struct extent_state *state; - struct io_failure_record *failrec; - - spin_lock(&tree->lock); - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, start); - if (!node) { - failrec = ERR_PTR(-ENOENT); - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); - if (state->start != start) { - failrec = ERR_PTR(-ENOENT); - goto out; - } - - failrec = state->failrec; -out: - spin_unlock(&tree->lock); + spin_lock(&inode->io_failure_lock); + node = rb_simple_search(&inode->io_failure_tree, start); + if (node) + failrec = rb_entry(node, struct io_failure_record, rb_node); + spin_unlock(&inode->io_failure_lock); return failrec; } -/* - * searches a range in the state tree for a given mask. - * If 'filled' == 1, this returns 1 only if every extent in the tree - * has the bits set. Otherwise, 1 is returned if any bit in the - * range is found set. - */ -int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int filled, struct extent_state *cached) -{ - struct extent_state *state = NULL; - struct rb_node *node; - int bitset = 0; - - spin_lock(&tree->lock); - if (cached && extent_state_in_tree(cached) && cached->start <= start && - cached->end > start) - node = &cached->rb_node; - else - node = tree_search(tree, start); - while (node && start <= end) { - state = rb_entry(node, struct extent_state, rb_node); - - if (filled && state->start > start) { - bitset = 0; - break; - } - - if (state->start > end) - break; - - if (state->state & bits) { - bitset = 1; - if (!filled) - break; - } else if (filled) { - bitset = 0; - break; - } - - if (state->end == (u64)-1) - break; - - start = state->end + 1; - if (start > end) - break; - node = rb_next(node); - if (!node) { - if (filled) - bitset = 0; - break; - } - } - spin_unlock(&tree->lock); - return bitset; -} - -int free_io_failure(struct extent_io_tree *failure_tree, - struct extent_io_tree *io_tree, - struct io_failure_record *rec) +static void free_io_failure(struct btrfs_inode *inode, + struct io_failure_record *rec) { - int ret; - int err = 0; - - set_state_failrec(failure_tree, rec->start, NULL); - ret = clear_extent_bits(failure_tree, rec->start, - rec->start + rec->len - 1, - EXTENT_LOCKED | EXTENT_DIRTY); - if (ret) - err = ret; - - ret = clear_extent_bits(io_tree, rec->start, - rec->start + rec->len - 1, - EXTENT_DAMAGED); - if (ret && !err) - err = ret; + spin_lock(&inode->io_failure_lock); + rb_erase(&rec->rb_node, &inode->io_failure_tree); + spin_unlock(&inode->io_failure_lock); kfree(rec); - return err; } /* @@ -2456,24 +659,18 @@ static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror) * each time an IO finishes, we do a fast check in the IO failure tree * to see if we need to process or clean up an io_failure_record */ -int clean_io_failure(struct btrfs_fs_info *fs_info, - struct extent_io_tree *failure_tree, - struct extent_io_tree *io_tree, u64 start, - struct page *page, u64 ino, unsigned int pg_offset) +int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, + struct page *page, unsigned int pg_offset) { - u64 private; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + u64 ino = btrfs_ino(inode); + u64 locked_start, locked_end; struct io_failure_record *failrec; - struct extent_state *state; int mirror; int ret; - private = 0; - ret = count_range_bits(failure_tree, &private, (u64)-1, 1, - EXTENT_DIRTY, 0); - if (!ret) - return 0; - - failrec = get_state_failrec(failure_tree, start); + failrec = get_failrec(inode, start); if (IS_ERR(failrec)) return 0; @@ -2482,14 +679,10 @@ int clean_io_failure(struct btrfs_fs_info *fs_info, if (sb_rdonly(fs_info->sb)) goto out; - spin_lock(&io_tree->lock); - state = find_first_extent_bit_state(io_tree, - failrec->start, - EXTENT_LOCKED); - spin_unlock(&io_tree->lock); - - if (!state || state->start > failrec->start || - state->end < failrec->start + failrec->len - 1) + ret = find_first_extent_bit(io_tree, failrec->bytenr, &locked_start, + &locked_end, EXTENT_LOCKED, NULL); + if (ret || locked_start > failrec->bytenr || + locked_end < failrec->bytenr + failrec->len - 1) goto out; mirror = failrec->this_mirror; @@ -2500,7 +693,7 @@ int clean_io_failure(struct btrfs_fs_info *fs_info, } while (mirror != failrec->failed_mirror); out: - free_io_failure(failure_tree, io_tree, failrec); + free_io_failure(inode, failrec); return 0; } @@ -2512,30 +705,26 @@ out: */ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) { - struct extent_io_tree *failure_tree = &inode->io_failure_tree; struct io_failure_record *failrec; - struct extent_state *state, *next; + struct rb_node *node, *next; - if (RB_EMPTY_ROOT(&failure_tree->state)) + if (RB_EMPTY_ROOT(&inode->io_failure_tree)) return; - spin_lock(&failure_tree->lock); - state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY); - while (state) { - if (state->start > end) + spin_lock(&inode->io_failure_lock); + node = rb_simple_search_first(&inode->io_failure_tree, start); + while (node) { + failrec = rb_entry(node, struct io_failure_record, rb_node); + if (failrec->bytenr > end) break; - ASSERT(state->end <= end); - - next = next_state(state); - - failrec = state->failrec; - free_extent_state(state); + next = rb_next(node); + rb_erase(&failrec->rb_node, &inode->io_failure_tree); kfree(failrec); - state = next; + node = next; } - spin_unlock(&failure_tree->lock); + spin_unlock(&inode->io_failure_lock); } static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, @@ -2545,16 +734,14 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 start = bbio->file_offset + bio_offset; struct io_failure_record *failrec; - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; const u32 sectorsize = fs_info->sectorsize; int ret; - failrec = get_state_failrec(failure_tree, start); + failrec = get_failrec(BTRFS_I(inode), start); if (!IS_ERR(failrec)) { btrfs_debug(fs_info, "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu", - failrec->logical, failrec->start, failrec->len); + failrec->logical, failrec->bytenr, failrec->len); /* * when data can be on disk more than twice, add to failrec here * (e.g. with a list for failed_mirror) to make @@ -2569,7 +756,8 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode if (!failrec) return ERR_PTR(-ENOMEM); - failrec->start = start; + RB_CLEAR_NODE(&failrec->rb_node); + failrec->bytenr = start; failrec->len = sectorsize; failrec->failed_mirror = bbio->mirror_num; failrec->this_mirror = bbio->mirror_num; @@ -2594,14 +782,8 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode } /* Set the bits in the private failure tree */ - ret = set_extent_bits(failure_tree, start, start + sectorsize - 1, - EXTENT_LOCKED | EXTENT_DIRTY); - if (ret >= 0) { - ret = set_state_failrec(failure_tree, start, failrec); - /* Set the bits in the inode's tree */ - ret = set_extent_bits(tree, start, start + sectorsize - 1, - EXTENT_DAMAGED); - } else if (ret < 0) { + ret = insert_failrec(BTRFS_I(inode), failrec); + if (ret) { kfree(failrec); return ERR_PTR(ret); } @@ -2616,8 +798,6 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, u64 start = failed_bbio->file_offset + bio_offset; struct io_failure_record *failrec; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct bio *failed_bio = &failed_bbio->bio; const int icsum = bio_offset >> fs_info->sectorsize_bits; struct bio *repair_bio; @@ -2646,17 +826,15 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, btrfs_debug(fs_info, "failed to repair num_copies %d this_mirror %d failed_mirror %d", failrec->num_copies, failrec->this_mirror, failrec->failed_mirror); - free_io_failure(failure_tree, tree, failrec); + free_io_failure(BTRFS_I(inode), failrec); return -EIO; } - repair_bio = btrfs_bio_alloc(1); + repair_bio = btrfs_bio_alloc(1, REQ_OP_READ, failed_bbio->end_io, + failed_bbio->private); repair_bbio = btrfs_bio(repair_bio); repair_bbio->file_offset = start; - repair_bio->bi_opf = REQ_OP_READ; - repair_bio->bi_end_io = failed_bio->bi_end_io; repair_bio->bi_iter.bi_sector = failrec->logical >> 9; - repair_bio->bi_private = failed_bio->bi_private; if (failed_bbio->csum) { const u32 csum_size = fs_info->csum_size; @@ -2720,8 +898,8 @@ static void end_sector_io(struct page *page, u64 offset, bool uptodate) if (uptodate) set_extent_uptodate(&inode->io_tree, offset, offset + sectorsize - 1, &cached, GFP_ATOMIC); - unlock_extent_cached_atomic(&inode->io_tree, offset, - offset + sectorsize - 1, &cached); + unlock_extent_atomic(&inode->io_tree, offset, offset + sectorsize - 1, + &cached); } static void submit_data_read_repair(struct inode *inode, @@ -2823,8 +1001,9 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. */ -static void end_bio_extent_writepage(struct bio *bio) +static void end_bio_extent_writepage(struct btrfs_bio *bbio) { + struct bio *bio = &bbio->bio; int error = blk_status_to_errno(bio->bi_status); struct bio_vec *bvec; u64 start; @@ -2924,11 +1103,7 @@ static void endio_readpage_release_extent(struct processed_extent *processed, * Now we don't have range contiguous to the processed range, release * the processed range now. */ - if (processed->uptodate && tree->track_uptodate) - set_extent_uptodate(tree, processed->start, processed->end, - &cached, GFP_ATOMIC); - unlock_extent_cached_atomic(tree, processed->start, processed->end, - &cached); + unlock_extent_atomic(tree, processed->start, processed->end, &cached); update: /* Update processed to current range */ @@ -2988,11 +1163,10 @@ static struct extent_buffer *find_extent_buffer_readpage( * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. */ -static void end_bio_extent_readpage(struct bio *bio) +static void end_bio_extent_readpage(struct btrfs_bio *bbio) { + struct bio *bio = &bbio->bio; struct bio_vec *bvec; - struct btrfs_bio *bbio = btrfs_bio(bio); - struct extent_io_tree *tree, *failure_tree; struct processed_extent processed = { 0 }; /* * The offset to the beginning of a bio, since one bio can never be @@ -3019,8 +1193,6 @@ static void end_bio_extent_readpage(struct bio *bio) "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", bio->bi_iter.bi_sector, bio->bi_status, bbio->mirror_num); - tree = &BTRFS_I(inode)->io_tree; - failure_tree = &BTRFS_I(inode)->io_failure_tree; /* * We always issue full-sector reads, but if some block in a @@ -3061,9 +1233,7 @@ static void end_bio_extent_readpage(struct bio *bio) loff_t i_size = i_size_read(inode); pgoff_t end_index = i_size >> PAGE_SHIFT; - clean_io_failure(BTRFS_I(inode)->root->fs_info, - failure_tree, tree, start, page, - btrfs_ino(BTRFS_I(inode)), 0); + btrfs_clean_io_failure(BTRFS_I(inode), start, page, 0); /* * Zero out the remaining part if this range straddles @@ -3162,50 +1332,6 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) return 0; } -/* - * Initialize the members up to but not including 'bio'. Use after allocating a - * new bio by bio_alloc_bioset as it does not initialize the bytes outside of - * 'bio' because use of __GFP_ZERO is not supported. - */ -static inline void btrfs_bio_init(struct btrfs_bio *bbio) -{ - memset(bbio, 0, offsetof(struct btrfs_bio, bio)); -} - -/* - * Allocate a btrfs_io_bio, with @nr_iovecs as maximum number of iovecs. - * - * The bio allocation is backed by bioset and does not fail. - */ -struct bio *btrfs_bio_alloc(unsigned int nr_iovecs) -{ - struct bio *bio; - - ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS); - bio = bio_alloc_bioset(NULL, nr_iovecs, 0, GFP_NOFS, &btrfs_bioset); - btrfs_bio_init(btrfs_bio(bio)); - return bio; -} - -struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) -{ - struct bio *bio; - struct btrfs_bio *bbio; - - ASSERT(offset <= UINT_MAX && size <= UINT_MAX); - - /* this will never fail when it's backed by a bioset */ - bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); - ASSERT(bio); - - bbio = btrfs_bio(bio); - btrfs_bio_init(bbio); - - bio_trim(bio, offset >> 9, size >> 9); - bbio->iter = bio->bi_iter; - return bio; -} - /** * Attempt to add a page to bio * @@ -3351,7 +1477,6 @@ static int alloc_new_bio(struct btrfs_inode *inode, struct btrfs_bio_ctrl *bio_ctrl, struct writeback_control *wbc, blk_opf_t opf, - bio_end_io_t end_io_func, u64 disk_bytenr, u32 offset, u64 file_offset, enum btrfs_compression_type compress_type) { @@ -3359,7 +1484,9 @@ static int alloc_new_bio(struct btrfs_inode *inode, struct bio *bio; int ret; - bio = btrfs_bio_alloc(BIO_MAX_VECS); + ASSERT(bio_ctrl->end_io_func); + + bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, bio_ctrl->end_io_func, NULL); /* * For compressed page range, its disk_bytenr is always @disk_bytenr * passed in, no matter if we have added any range into previous bio. @@ -3370,8 +1497,6 @@ static int alloc_new_bio(struct btrfs_inode *inode, bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; bio_ctrl->bio = bio; bio_ctrl->compress_type = compress_type; - bio->bi_end_io = end_io_func; - bio->bi_opf = opf; ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); if (ret < 0) goto error; @@ -3410,31 +1535,30 @@ static int alloc_new_bio(struct btrfs_inode *inode, return 0; error: bio_ctrl->bio = NULL; - bio->bi_status = errno_to_blk_status(ret); - bio_endio(bio); + btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); return ret; } /* * @opf: bio REQ_OP_* and REQ_* flags as one value * @wbc: optional writeback control for io accounting - * @page: page to add to the bio * @disk_bytenr: logical bytenr where the write will be + * @page: page to add to the bio * @size: portion of page that we want to write to * @pg_offset: offset of the new bio or to check whether we are adding * a contiguous page to the previous one - * @bio_ret: must be valid pointer, newly allocated bio will be stored there - * @end_io_func: end_io callback for new bio - * @mirror_num: desired mirror to read/write - * @prev_bio_flags: flags of previous bio to see if we can merge the current one * @compress_type: compress type for current bio + * + * The will either add the page into the existing @bio_ctrl->bio, or allocate a + * new one in @bio_ctrl->bio. + * The mirror number for this IO should already be initizlied in + * @bio_ctrl->mirror_num. */ static int submit_extent_page(blk_opf_t opf, struct writeback_control *wbc, struct btrfs_bio_ctrl *bio_ctrl, - struct page *page, u64 disk_bytenr, + u64 disk_bytenr, struct page *page, size_t size, unsigned long pg_offset, - bio_end_io_t end_io_func, enum btrfs_compression_type compress_type, bool force_bio_submit) { @@ -3446,6 +1570,9 @@ static int submit_extent_page(blk_opf_t opf, ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE && pg_offset + size <= PAGE_SIZE); + + ASSERT(bio_ctrl->end_io_func); + if (force_bio_submit) submit_one_bio(bio_ctrl); @@ -3456,7 +1583,7 @@ static int submit_extent_page(blk_opf_t opf, /* Allocate new bio if needed */ if (!bio_ctrl->bio) { ret = alloc_new_bio(inode, bio_ctrl, wbc, opf, - end_io_func, disk_bytenr, offset, + disk_bytenr, offset, page_offset(page) + cur, compress_type); if (ret < 0) @@ -3613,7 +1740,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, u64 extent_offset; u64 last_byte = i_size_read(inode); u64 block_start; - u64 cur_end; struct extent_map *em; int ret = 0; size_t pg_offset = 0; @@ -3623,7 +1749,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, ret = set_page_extent_mapped(page); if (ret < 0) { - unlock_extent(tree, start, end); + unlock_extent(tree, start, end, NULL); btrfs_page_set_error(fs_info, page, start, PAGE_SIZE); unlock_page(page); goto out; @@ -3637,6 +1763,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, memzero_page(page, zero_offset, iosize); } } + bio_ctrl->end_io_func = end_bio_extent_readpage; begin_page_read(fs_info, page); while (cur <= end) { unsigned long this_bio_flag = 0; @@ -3651,15 +1778,14 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, memzero_page(page, pg_offset, iosize); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); - unlock_extent_cached(tree, cur, - cur + iosize - 1, &cached); + unlock_extent(tree, cur, cur + iosize - 1, &cached); end_page_read(page, true, cur, iosize); break; } em = __get_extent_map(inode, page, pg_offset, cur, end - cur + 1, em_cached); if (IS_ERR(em)) { - unlock_extent(tree, cur, end); + unlock_extent(tree, cur, end, NULL); end_page_read(page, false, cur, end + 1 - cur); ret = PTR_ERR(em); break; @@ -3672,7 +1798,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, this_bio_flag = em->compress_type; iosize = min(extent_map_end(em) - cur, end - cur + 1); - cur_end = min(extent_map_end(em) - 1, end); iosize = ALIGN(iosize, blocksize); if (this_bio_flag != BTRFS_COMPRESS_NONE) disk_bytenr = em->block_start; @@ -3735,43 +1860,31 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); - unlock_extent_cached(tree, cur, - cur + iosize - 1, &cached); + unlock_extent(tree, cur, cur + iosize - 1, &cached); end_page_read(page, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; } /* the get_extent function already copied into the page */ - if (test_range_bit(tree, cur, cur_end, - EXTENT_UPTODATE, 1, NULL)) { - unlock_extent(tree, cur, cur + iosize - 1); - end_page_read(page, true, cur, iosize); - cur = cur + iosize; - pg_offset += iosize; - continue; - } - /* we have an inline extent but it didn't get marked up - * to date. Error out - */ if (block_start == EXTENT_MAP_INLINE) { - unlock_extent(tree, cur, cur + iosize - 1); - end_page_read(page, false, cur, iosize); + unlock_extent(tree, cur, cur + iosize - 1, NULL); + end_page_read(page, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; } ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, - bio_ctrl, page, disk_bytenr, iosize, - pg_offset, end_bio_extent_readpage, - this_bio_flag, force_bio_submit); + bio_ctrl, disk_bytenr, page, iosize, + pg_offset, this_bio_flag, + force_bio_submit); if (ret) { /* * We have to unlock the remaining range, or the page * will never be unlocked. */ - unlock_extent(tree, cur, end); + unlock_extent(tree, cur, end, NULL); end_page_read(page, false, cur, end + 1 - cur); goto out; } @@ -3984,6 +2097,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, */ wbc->nr_to_write--; + epd->bio_ctrl.end_io_func = end_bio_extent_writepage; while (cur <= end) { u64 disk_bytenr; u64 em_end; @@ -4077,10 +2191,9 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, btrfs_page_clear_dirty(fs_info, page, cur, iosize); ret = submit_extent_page(op | write_flags, wbc, - &epd->bio_ctrl, page, - disk_bytenr, iosize, + &epd->bio_ctrl, disk_bytenr, + page, iosize, cur - page_offset(page), - end_bio_extent_writepage, 0, false); if (ret) { has_error = true; @@ -4431,8 +2544,9 @@ static struct extent_buffer *find_extent_buffer_nolock( * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback() * after all extent buffers in the page has finished their writeback. */ -static void end_bio_subpage_eb_writepage(struct bio *bio) +static void end_bio_subpage_eb_writepage(struct btrfs_bio *bbio) { + struct bio *bio = &bbio->bio; struct btrfs_fs_info *fs_info; struct bio_vec *bvec; struct bvec_iter_all iter_all; @@ -4488,8 +2602,9 @@ static void end_bio_subpage_eb_writepage(struct bio *bio) bio_put(bio); } -static void end_bio_extent_buffer_writepage(struct bio *bio) +static void end_bio_extent_buffer_writepage(struct btrfs_bio *bbio) { + struct bio *bio = &bbio->bio; struct bio_vec *bvec; struct extent_buffer *eb; int done; @@ -4571,10 +2686,11 @@ static int write_one_subpage_eb(struct extent_buffer *eb, if (no_dirty_ebs) clear_page_dirty_for_io(page); + epd->bio_ctrl.end_io_func = end_bio_subpage_eb_writepage; + ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - &epd->bio_ctrl, page, eb->start, eb->len, - eb->start - page_offset(page), - end_bio_subpage_eb_writepage, 0, false); + &epd->bio_ctrl, eb->start, page, eb->len, + eb->start - page_offset(page), 0, false); if (ret) { btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len); set_btree_ioerr(page, eb); @@ -4605,6 +2721,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, prepare_eb_write(eb); + epd->bio_ctrl.end_io_func = end_bio_extent_buffer_writepage; + num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i]; @@ -4612,10 +2730,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - &epd->bio_ctrl, p, disk_bytenr, - PAGE_SIZE, 0, - end_bio_extent_buffer_writepage, - 0, false); + &epd->bio_ctrl, disk_bytenr, p, + PAGE_SIZE, 0, 0, false); if (ret) { set_btree_ioerr(p, eb); if (PageWriteback(p)) @@ -5236,7 +3352,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree, if (start > end) return 0; - lock_extent_bits(tree, start, end, &cached_state); + lock_extent(tree, start, end, &cached_state); folio_wait_writeback(folio); /* @@ -5244,7 +3360,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree, * so here we only need to unlock the extent range to free any * existing extent state. */ - unlock_extent_cached(tree, start, end, &cached_state); + unlock_extent(tree, start, end, &cached_state); return 0; } @@ -5263,15 +3379,17 @@ static int try_release_extent_state(struct extent_io_tree *tree, if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) { ret = 0; } else { + u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | + EXTENT_DELALLOC_NEW | EXTENT_CTLBITS); + /* * At this point we can safely clear everything except the * locked bit, the nodatasum bit and the delalloc new bit. * The delalloc new bit will be cleared by ordered extent * completion. */ - ret = __clear_extent_bit(tree, start, end, - ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW), - 0, 0, NULL, mask, NULL); + ret = __clear_extent_bit(tree, start, end, clear_bits, NULL, + mask, NULL); /* if clear_extent_bit failed for enomem reasons, * we can't allow the release to continue. @@ -5370,42 +3488,6 @@ next: } /* - * helper function for fiemap, which doesn't want to see any holes. - * This maps until we find something past 'last' - */ -static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode, - u64 offset, u64 last) -{ - u64 sectorsize = btrfs_inode_sectorsize(inode); - struct extent_map *em; - u64 len; - - if (offset >= last) - return NULL; - - while (1) { - len = last - offset; - if (len == 0) - break; - len = ALIGN(len, sectorsize); - em = btrfs_get_extent_fiemap(inode, offset, len); - if (IS_ERR(em)) - return em; - - /* if this isn't a hole return it */ - if (em->block_start != EXTENT_MAP_HOLE) - return em; - - /* this is a hole, advance to the next extent */ - offset = extent_map_end(em); - free_extent_map(em); - if (offset >= last) - break; - } - return NULL; -} - -/* * To cache previous fiemap extent * * Will be used for merging fiemap extent @@ -5434,6 +3516,9 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, { int ret = 0; + /* Set at the end of extent_fiemap(). */ + ASSERT((flags & FIEMAP_EXTENT_LAST) == 0); + if (!cache->cached) goto assign; @@ -5457,16 +3542,13 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, * So truly compressed (physical size smaller than logical size) * extents won't get merged with each other * - * 3) Share same flags except FIEMAP_EXTENT_LAST - * So regular extent won't get merged with prealloc extent + * 3) Share same flags */ if (cache->offset + cache->len == offset && cache->phys + cache->len == phys && - (cache->flags & ~FIEMAP_EXTENT_LAST) == - (flags & ~FIEMAP_EXTENT_LAST)) { + cache->flags == flags) { cache->len += len; - cache->flags |= flags; - goto try_submit_last; + return 0; } /* Not mergeable, need to submit cached one */ @@ -5481,13 +3563,8 @@ assign: cache->phys = phys; cache->len = len; cache->flags = flags; -try_submit_last: - if (cache->flags & FIEMAP_EXTENT_LAST) { - ret = fiemap_fill_next_extent(fieinfo, cache->offset, - cache->phys, cache->len, cache->flags); - cache->cached = false; - } - return ret; + + return 0; } /* @@ -5517,215 +3594,534 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, return ret; } -int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, - u64 start, u64 len) +static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path) { - int ret = 0; - u64 off; - u64 max = start + len; - u32 flags = 0; - u32 found_type; - u64 last; - u64 last_for_get_extent = 0; - u64 disko = 0; - u64 isize = i_size_read(&inode->vfs_inode); - struct btrfs_key found_key; - struct extent_map *em = NULL; - struct extent_state *cached_state = NULL; - struct btrfs_path *path; - struct btrfs_root *root = inode->root; - struct fiemap_cache cache = { 0 }; - struct ulist *roots; - struct ulist *tmp_ulist; - int end = 0; - u64 em_start = 0; - u64 em_len = 0; - u64 em_end = 0; + struct extent_buffer *clone; + struct btrfs_key key; + int slot; + int ret; - if (len == 0) - return -EINVAL; + path->slots[0]++; + if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) + return 0; - path = btrfs_alloc_path(); - if (!path) + ret = btrfs_next_leaf(inode->root, path); + if (ret != 0) + return ret; + + /* + * Don't bother with cloning if there are no more file extent items for + * our inode. + */ + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) + return 1; + + /* See the comment at fiemap_search_slot() about why we clone. */ + clone = btrfs_clone_extent_buffer(path->nodes[0]); + if (!clone) return -ENOMEM; - roots = ulist_alloc(GFP_KERNEL); - tmp_ulist = ulist_alloc(GFP_KERNEL); - if (!roots || !tmp_ulist) { - ret = -ENOMEM; - goto out_free_ulist; + slot = path->slots[0]; + btrfs_release_path(path); + path->nodes[0] = clone; + path->slots[0] = slot; + + return 0; +} + +/* + * Search for the first file extent item that starts at a given file offset or + * the one that starts immediately before that offset. + * Returns: 0 on success, < 0 on error, 1 if not found. + */ +static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path, + u64 file_offset) +{ + const u64 ino = btrfs_ino(inode); + struct btrfs_root *root = inode->root; + struct extent_buffer *clone; + struct btrfs_key key; + int slot; + int ret; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = file_offset; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + + if (ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret != 0) + return ret; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) + return 1; } /* - * We can't initialize that to 'start' as this could miss extents due - * to extent item merging + * We clone the leaf and use it during fiemap. This is because while + * using the leaf we do expensive things like checking if an extent is + * shared, which can take a long time. In order to prevent blocking + * other tasks for too long, we use a clone of the leaf. We have locked + * the file range in the inode's io tree, so we know none of our file + * extent items can change. This way we avoid blocking other tasks that + * want to insert items for other inodes in the same leaf or b+tree + * rebalance operations (triggered for example when someone is trying + * to push items into this leaf when trying to insert an item in a + * neighbour leaf). + * We also need the private clone because holding a read lock on an + * extent buffer of the subvolume's b+tree will make lockdep unhappy + * when we call fiemap_fill_next_extent(), because that may cause a page + * fault when filling the user space buffer with fiemap data. */ - off = 0; - start = round_down(start, btrfs_inode_sectorsize(inode)); - len = round_up(max, btrfs_inode_sectorsize(inode)) - start; + clone = btrfs_clone_extent_buffer(path->nodes[0]); + if (!clone) + return -ENOMEM; + + slot = path->slots[0]; + btrfs_release_path(path); + path->nodes[0] = clone; + path->slots[0] = slot; + + return 0; +} + +/* + * Process a range which is a hole or a prealloc extent in the inode's subvolume + * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc + * extent. The end offset (@end) is inclusive. + */ +static int fiemap_process_hole(struct btrfs_inode *inode, + struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache, + struct btrfs_backref_shared_cache *backref_cache, + u64 disk_bytenr, u64 extent_offset, + u64 extent_gen, + struct ulist *roots, struct ulist *tmp_ulist, + u64 start, u64 end) +{ + const u64 i_size = i_size_read(&inode->vfs_inode); + const u64 ino = btrfs_ino(inode); + u64 cur_offset = start; + u64 last_delalloc_end = 0; + u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN; + bool checked_extent_shared = false; + int ret; /* - * lookup the last file extent. We're not using i_size here - * because there might be preallocation past i_size + * There can be no delalloc past i_size, so don't waste time looking for + * it beyond i_size. */ - ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1, - 0); - if (ret < 0) { - goto out_free_ulist; - } else { - WARN_ON(!ret); - if (ret == 1) - ret = 0; - } + while (cur_offset < end && cur_offset < i_size) { + u64 delalloc_start; + u64 delalloc_end; + u64 prealloc_start; + u64 prealloc_len = 0; + bool delalloc; + + delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end, + &delalloc_start, + &delalloc_end); + if (!delalloc) + break; - path->slots[0]--; - btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); - found_type = found_key.type; - - /* No extents, but there might be delalloc bits */ - if (found_key.objectid != btrfs_ino(inode) || - found_type != BTRFS_EXTENT_DATA_KEY) { - /* have to trust i_size as the end */ - last = (u64)-1; - last_for_get_extent = isize; - } else { /* - * remember the start of the last extent. There are a - * bunch of different factors that go into the length of the - * extent, so its much less complex to remember where it started + * If this is a prealloc extent we have to report every section + * of it that has no delalloc. */ - last = found_key.offset; - last_for_get_extent = last + 1; + if (disk_bytenr != 0) { + if (last_delalloc_end == 0) { + prealloc_start = start; + prealloc_len = delalloc_start - start; + } else { + prealloc_start = last_delalloc_end + 1; + prealloc_len = delalloc_start - prealloc_start; + } + } + + if (prealloc_len > 0) { + if (!checked_extent_shared && fieinfo->fi_extents_max) { + ret = btrfs_is_data_extent_shared(inode->root, + ino, disk_bytenr, + extent_gen, roots, + tmp_ulist, + backref_cache); + if (ret < 0) + return ret; + else if (ret > 0) + prealloc_flags |= FIEMAP_EXTENT_SHARED; + + checked_extent_shared = true; + } + ret = emit_fiemap_extent(fieinfo, cache, prealloc_start, + disk_bytenr + extent_offset, + prealloc_len, prealloc_flags); + if (ret) + return ret; + extent_offset += prealloc_len; + } + + ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0, + delalloc_end + 1 - delalloc_start, + FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); + if (ret) + return ret; + + last_delalloc_end = delalloc_end; + cur_offset = delalloc_end + 1; + extent_offset += cur_offset - delalloc_start; + cond_resched(); } - btrfs_release_path(path); /* - * we might have some extents allocated but more delalloc past those - * extents. so, we trust isize unless the start of the last extent is - * beyond isize + * Either we found no delalloc for the whole prealloc extent or we have + * a prealloc extent that spans i_size or starts at or after i_size. */ - if (last < isize) { - last = (u64)-1; - last_for_get_extent = isize; + if (disk_bytenr != 0 && last_delalloc_end < end) { + u64 prealloc_start; + u64 prealloc_len; + + if (last_delalloc_end == 0) { + prealloc_start = start; + prealloc_len = end + 1 - start; + } else { + prealloc_start = last_delalloc_end + 1; + prealloc_len = end + 1 - prealloc_start; + } + + if (!checked_extent_shared && fieinfo->fi_extents_max) { + ret = btrfs_is_data_extent_shared(inode->root, + ino, disk_bytenr, + extent_gen, roots, + tmp_ulist, + backref_cache); + if (ret < 0) + return ret; + else if (ret > 0) + prealloc_flags |= FIEMAP_EXTENT_SHARED; + } + ret = emit_fiemap_extent(fieinfo, cache, prealloc_start, + disk_bytenr + extent_offset, + prealloc_len, prealloc_flags); + if (ret) + return ret; } - lock_extent_bits(&inode->io_tree, start, start + len - 1, - &cached_state); + return 0; +} - em = get_extent_skip_holes(inode, start, last_for_get_extent); - if (!em) - goto out; - if (IS_ERR(em)) { - ret = PTR_ERR(em); +static int fiemap_find_last_extent_offset(struct btrfs_inode *inode, + struct btrfs_path *path, + u64 *last_extent_end_ret) +{ + const u64 ino = btrfs_ino(inode); + struct btrfs_root *root = inode->root; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *ei; + struct btrfs_key key; + u64 disk_bytenr; + int ret; + + /* + * Lookup the last file extent. We're not using i_size here because + * there might be preallocation past i_size. + */ + ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0); + /* There can't be a file extent item at offset (u64)-1 */ + ASSERT(ret != 0); + if (ret < 0) + return ret; + + /* + * For a non-existing key, btrfs_search_slot() always leaves us at a + * slot > 0, except if the btree is empty, which is impossible because + * at least it has the inode item for this inode and all the items for + * the root inode 256. + */ + ASSERT(path->slots[0] > 0); + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { + /* No file extent items in the subvolume tree. */ + *last_extent_end_ret = 0; + return 0; + } + + /* + * For an inline extent, the disk_bytenr is where inline data starts at, + * so first check if we have an inline extent item before checking if we + * have an implicit hole (disk_bytenr == 0). + */ + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) { + *last_extent_end_ret = btrfs_file_extent_end(path); + return 0; + } + + /* + * Find the last file extent item that is not a hole (when NO_HOLES is + * not enabled). This should take at most 2 iterations in the worst + * case: we have one hole file extent item at slot 0 of a leaf and + * another hole file extent item as the last item in the previous leaf. + * This is because we merge file extent items that represent holes. + */ + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + while (disk_bytenr == 0) { + ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); + if (ret < 0) { + return ret; + } else if (ret > 0) { + /* No file extent items that are not holes. */ + *last_extent_end_ret = 0; + return 0; + } + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + } + + *last_extent_end_ret = btrfs_file_extent_end(path); + return 0; +} + +int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + const u64 ino = btrfs_ino(inode); + struct extent_state *cached_state = NULL; + struct btrfs_path *path; + struct btrfs_root *root = inode->root; + struct fiemap_cache cache = { 0 }; + struct btrfs_backref_shared_cache *backref_cache; + struct ulist *roots; + struct ulist *tmp_ulist; + u64 last_extent_end; + u64 prev_extent_end; + u64 lockstart; + u64 lockend; + bool stopped = false; + int ret; + + backref_cache = kzalloc(sizeof(*backref_cache), GFP_KERNEL); + path = btrfs_alloc_path(); + roots = ulist_alloc(GFP_KERNEL); + tmp_ulist = ulist_alloc(GFP_KERNEL); + if (!backref_cache || !path || !roots || !tmp_ulist) { + ret = -ENOMEM; goto out; } - while (!end) { - u64 offset_in_extent = 0; + lockstart = round_down(start, root->fs_info->sectorsize); + lockend = round_up(start + len, root->fs_info->sectorsize); + prev_extent_end = lockstart; - /* break if the extent we found is outside the range */ - if (em->start >= max || extent_map_end(em) < off) - break; + lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); - /* - * get_extent may return an extent that starts before our - * requested range. We have to make sure the ranges - * we return to fiemap always move forward and don't - * overlap, so adjust the offsets here - */ - em_start = max(em->start, off); + ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); + if (ret < 0) + goto out_unlock; + btrfs_release_path(path); + path->reada = READA_FORWARD; + ret = fiemap_search_slot(inode, path, lockstart); + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { /* - * record the offset from the start of the extent - * for adjusting the disk offset below. Only do this if the - * extent isn't compressed since our in ram offset may be past - * what we have actually allocated on disk. + * No file extent item found, but we may have delalloc between + * the current offset and i_size. So check for that. */ - if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) - offset_in_extent = em_start - em->start; - em_end = extent_map_end(em); - em_len = em_end - em_start; - flags = 0; - if (em->block_start < EXTENT_MAP_LAST_BYTE) - disko = em->block_start + offset_in_extent; - else - disko = 0; + ret = 0; + goto check_eof_delalloc; + } + + while (prev_extent_end < lockend) { + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_file_extent_item *ei; + struct btrfs_key key; + u64 extent_end; + u64 extent_len; + u64 extent_offset = 0; + u64 extent_gen; + u64 disk_bytenr = 0; + u64 flags = 0; + int extent_type; + u8 compression; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) + break; + + extent_end = btrfs_file_extent_end(path); /* - * bump off for our next call to get_extent + * The first iteration can leave us at an extent item that ends + * before our range's start. Move to the next item. */ - off = extent_map_end(em); - if (off >= max) - end = 1; - - if (em->block_start == EXTENT_MAP_LAST_BYTE) { - end = 1; - flags |= FIEMAP_EXTENT_LAST; - } else if (em->block_start == EXTENT_MAP_INLINE) { - flags |= (FIEMAP_EXTENT_DATA_INLINE | - FIEMAP_EXTENT_NOT_ALIGNED); - } else if (em->block_start == EXTENT_MAP_DELALLOC) { - flags |= (FIEMAP_EXTENT_DELALLOC | - FIEMAP_EXTENT_UNKNOWN); - } else if (fieinfo->fi_extents_max) { - u64 bytenr = em->block_start - - (em->start - em->orig_start); + if (extent_end <= lockstart) + goto next_item; - /* - * As btrfs supports shared space, this information - * can be exported to userspace tools via - * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0 - * then we're just getting a count and we can skip the - * lookup stuff. - */ - ret = btrfs_check_shared(root, btrfs_ino(inode), - bytenr, roots, tmp_ulist); - if (ret < 0) - goto out_free; - if (ret) - flags |= FIEMAP_EXTENT_SHARED; - ret = 0; + /* We have in implicit hole (NO_HOLES feature enabled). */ + if (prev_extent_end < key.offset) { + const u64 range_end = min(key.offset, lockend) - 1; + + ret = fiemap_process_hole(inode, fieinfo, &cache, + backref_cache, 0, 0, 0, + roots, tmp_ulist, + prev_extent_end, range_end); + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { + /* fiemap_fill_next_extent() told us to stop. */ + stopped = true; + break; + } + + /* We've reached the end of the fiemap range, stop. */ + if (key.offset >= lockend) { + stopped = true; + break; + } } - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + + extent_len = extent_end - key.offset; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + compression = btrfs_file_extent_compression(leaf, ei); + extent_type = btrfs_file_extent_type(leaf, ei); + extent_gen = btrfs_file_extent_generation(leaf, ei); + + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + if (compression == BTRFS_COMPRESS_NONE) + extent_offset = btrfs_file_extent_offset(leaf, ei); + } + + if (compression != BTRFS_COMPRESS_NONE) flags |= FIEMAP_EXTENT_ENCODED; - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - flags |= FIEMAP_EXTENT_UNWRITTEN; - free_extent_map(em); - em = NULL; - if ((em_start >= last) || em_len == (u64)-1 || - (last == (u64)-1 && isize <= em_end)) { - flags |= FIEMAP_EXTENT_LAST; - end = 1; + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + flags |= FIEMAP_EXTENT_DATA_INLINE; + flags |= FIEMAP_EXTENT_NOT_ALIGNED; + ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0, + extent_len, flags); + } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + ret = fiemap_process_hole(inode, fieinfo, &cache, + backref_cache, + disk_bytenr, extent_offset, + extent_gen, roots, tmp_ulist, + key.offset, extent_end - 1); + } else if (disk_bytenr == 0) { + /* We have an explicit hole. */ + ret = fiemap_process_hole(inode, fieinfo, &cache, + backref_cache, 0, 0, 0, + roots, tmp_ulist, + key.offset, extent_end - 1); + } else { + /* We have a regular extent. */ + if (fieinfo->fi_extents_max) { + ret = btrfs_is_data_extent_shared(root, ino, + disk_bytenr, + extent_gen, + roots, + tmp_ulist, + backref_cache); + if (ret < 0) + goto out_unlock; + else if (ret > 0) + flags |= FIEMAP_EXTENT_SHARED; + } + + ret = emit_fiemap_extent(fieinfo, &cache, key.offset, + disk_bytenr + extent_offset, + extent_len, flags); } - /* now scan forward to see if this is really the last extent. */ - em = get_extent_skip_holes(inode, off, last_for_get_extent); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { + /* fiemap_fill_next_extent() told us to stop. */ + stopped = true; + break; } - if (!em) { - flags |= FIEMAP_EXTENT_LAST; - end = 1; + + prev_extent_end = extent_end; +next_item: + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out_unlock; } - ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko, - em_len, flags); - if (ret) { - if (ret == 1) - ret = 0; - goto out_free; + + ret = fiemap_next_leaf_item(inode, path); + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { + /* No more file extent items for this inode. */ + break; } + cond_resched(); } -out_free: - if (!ret) - ret = emit_last_fiemap_cache(fieinfo, &cache); - free_extent_map(em); -out: - unlock_extent_cached(&inode->io_tree, start, start + len - 1, - &cached_state); -out_free_ulist: +check_eof_delalloc: + /* + * Release (and free) the path before emitting any final entries to + * fiemap_fill_next_extent() to keep lockdep happy. This is because + * once we find no more file extent items exist, we may have a + * non-cloned leaf, and fiemap_fill_next_extent() can trigger page + * faults when copying data to the user space buffer. + */ + btrfs_free_path(path); + path = NULL; + + if (!stopped && prev_extent_end < lockend) { + ret = fiemap_process_hole(inode, fieinfo, &cache, backref_cache, + 0, 0, 0, roots, tmp_ulist, + prev_extent_end, lockend - 1); + if (ret < 0) + goto out_unlock; + prev_extent_end = lockend; + } + + if (cache.cached && cache.offset + cache.len >= last_extent_end) { + const u64 i_size = i_size_read(&inode->vfs_inode); + + if (prev_extent_end < i_size) { + u64 delalloc_start; + u64 delalloc_end; + bool delalloc; + + delalloc = btrfs_find_delalloc_in_range(inode, + prev_extent_end, + i_size - 1, + &delalloc_start, + &delalloc_end); + if (!delalloc) + cache.flags |= FIEMAP_EXTENT_LAST; + } else { + cache.flags |= FIEMAP_EXTENT_LAST; + } + } + + ret = emit_last_fiemap_cache(fieinfo, &cache); + +out_unlock: + unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); +out: + kfree(backref_cache); btrfs_free_path(path); ulist_free(roots); ulist_free(tmp_ulist); @@ -5856,7 +4252,7 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) { btrfs_release_extent_buffer_pages(eb); - btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list); + btrfs_leak_debug_del_eb(eb); __free_extent_buffer(eb); } @@ -5873,8 +4269,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, eb->bflags = 0; init_rwsem(&eb->lock); - btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list, - &fs_info->allocated_ebs); + btrfs_leak_debug_add_eb(eb); INIT_LIST_HEAD(&eb->release_list); spin_lock_init(&eb->refs_lock); @@ -6342,7 +4737,7 @@ static int release_extent_buffer(struct extent_buffer *eb) spin_unlock(&eb->refs_lock); } - btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list); + btrfs_leak_debug_del_eb(eb); /* Should be safe to release our pages at this point */ btrfs_release_extent_buffer_pages(eb); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS @@ -6362,18 +4757,16 @@ static int release_extent_buffer(struct extent_buffer *eb) void free_extent_buffer(struct extent_buffer *eb) { int refs; - int old; if (!eb) return; + refs = atomic_read(&eb->refs); while (1) { - refs = atomic_read(&eb->refs); if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3) || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs == 1)) break; - old = atomic_cmpxchg(&eb->refs, refs, refs - 1); - if (old == refs) + if (atomic_try_cmpxchg(&eb->refs, &refs, refs - 1)) return; } @@ -6569,7 +4962,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1)) return -EAGAIN; } else { - ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1); + ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL); if (ret < 0) return ret; } @@ -6579,7 +4972,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, PageUptodate(page) || btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) { set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - unlock_extent(io_tree, eb->start, eb->start + eb->len - 1); + unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL); return ret; } @@ -6587,13 +4980,14 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, eb->read_mirror = 0; atomic_set(&eb->io_pages, 1); check_buffer_tree_ref(eb); + bio_ctrl.end_io_func = end_bio_extent_readpage; + btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len); ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl, - page, eb->start, eb->len, - eb->start - page_offset(page), - end_bio_extent_readpage, 0, true); + eb->start, page, eb->len, + eb->start - page_offset(page), 0, true); if (ret) { /* * In the endio function, if we hit something wrong we will @@ -6684,6 +5078,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) * set io_pages. See check_buffer_tree_ref for a more detailed comment. */ check_buffer_tree_ref(eb); + bio_ctrl.end_io_func = end_bio_extent_readpage; for (i = 0; i < num_pages; i++) { page = eb->pages[i]; @@ -6696,9 +5091,8 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) ClearPageError(page); err = submit_extent_page(REQ_OP_READ, NULL, - &bio_ctrl, page, page_offset(page), - PAGE_SIZE, 0, end_bio_extent_readpage, - 0, false); + &bio_ctrl, page_offset(page), page, + PAGE_SIZE, 0, 0, false); if (err) { /* * We failed to submit the bio so it's the |