diff options
author | Nick Terrell <terrelln@fb.com> | 2022-12-13 16:21:55 -0800 |
---|---|---|
committer | Nick Terrell <terrelln@fb.com> | 2022-12-13 16:21:55 -0800 |
commit | 4f2c0a4acffbec01079c28f839422e64ddeff004 (patch) | |
tree | 06ada4a8a6d94a94c93944806041b8c994cebfc5 /fs/btrfs | |
parent | 88a309465b3f05a100c3b81966982c0f9f5d23a6 (diff) | |
parent | 830b3c68c1fb1e9176028d02ef86f3cf76aa2476 (diff) | |
download | lwn-4f2c0a4acffbec01079c28f839422e64ddeff004.tar.gz lwn-4f2c0a4acffbec01079c28f839422e64ddeff004.zip |
Merge branch 'main' into zstd-linus
Diffstat (limited to 'fs/btrfs')
91 files changed, 19054 insertions, 12918 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 4188ba3fd8c3..fa9ddcc9eb0b 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -17,6 +17,7 @@ subdir-ccflags-y += $(condflags) subdir-ccflags-y += -Wno-missing-field-initializers subdir-ccflags-y += -Wno-sign-compare subdir-ccflags-y += -Wno-type-limits +subdir-ccflags-y += -Wno-shift-negative-value obj-$(CONFIG_BTRFS_FS) := btrfs.o @@ -30,7 +31,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ - subpage.o tree-mod-log.o + subpage.o tree-mod-log.o extent-io-tree.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 0a0d0eccee4e..548d6a5477b4 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -55,9 +55,8 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu) return acl; } -static int __btrfs_set_acl(struct btrfs_trans_handle *trans, - struct user_namespace *mnt_userns, - struct inode *inode, struct posix_acl *acl, int type) +int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, + struct posix_acl *acl, int type) { int ret, size = 0; const char *name; @@ -123,40 +122,8 @@ int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, if (ret) return ret; } - ret = __btrfs_set_acl(NULL, mnt_userns, inode, acl, type); + ret = __btrfs_set_acl(NULL, inode, acl, type); if (ret) inode->i_mode = old_mode; return ret; } - -int btrfs_init_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir) -{ - struct posix_acl *default_acl, *acl; - int ret = 0; - - /* this happens with subvols */ - if (!dir) - return 0; - - ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); - if (ret) - return ret; - - if (default_acl) { - ret = __btrfs_set_acl(trans, &init_user_ns, inode, default_acl, - ACL_TYPE_DEFAULT); - posix_acl_release(default_acl); - } - - if (acl) { - if (!ret) - ret = __btrfs_set_acl(trans, &init_user_ns, inode, acl, - ACL_TYPE_ACCESS); - posix_acl_release(acl); - } - - if (!default_acl && !acl) - cache_no_acl(inode); - return ret; -} diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 43c89952b7d2..aac240430efe 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -15,13 +15,12 @@ enum { WORK_DONE_BIT, WORK_ORDER_DONE_BIT, - WORK_HIGH_PRIO_BIT, }; #define NO_THRESHOLD (-1) #define DFT_THRESHOLD (32) -struct __btrfs_workqueue { +struct btrfs_workqueue { struct workqueue_struct *normal_wq; /* File system this workqueue services */ @@ -48,12 +47,7 @@ struct __btrfs_workqueue { spinlock_t thres_lock; }; -struct btrfs_workqueue { - struct __btrfs_workqueue *normal; - struct __btrfs_workqueue *high; -}; - -struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq) +struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq) { return wq->fs_info; } @@ -66,22 +60,22 @@ struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work) bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq) { /* - * We could compare wq->normal->pending with num_online_cpus() + * We could compare wq->pending with num_online_cpus() * to support "thresh == NO_THRESHOLD" case, but it requires * moving up atomic_inc/dec in thresh_queue/exec_hook. Let's * postpone it until someone needs the support of that case. */ - if (wq->normal->thresh == NO_THRESHOLD) + if (wq->thresh == NO_THRESHOLD) return false; - return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2; + return atomic_read(&wq->pending) > wq->thresh * 2; } -static struct __btrfs_workqueue * -__btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, - unsigned int flags, int limit_active, int thresh) +struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, + const char *name, unsigned int flags, + int limit_active, int thresh) { - struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); + struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); if (!ret) return NULL; @@ -105,12 +99,8 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, ret->thresh = thresh; } - if (flags & WQ_HIGHPRI) - ret->normal_wq = alloc_workqueue("btrfs-%s-high", flags, - ret->current_active, name); - else - ret->normal_wq = alloc_workqueue("btrfs-%s", flags, - ret->current_active, name); + ret->normal_wq = alloc_workqueue("btrfs-%s", flags, ret->current_active, + name); if (!ret->normal_wq) { kfree(ret); return NULL; @@ -119,41 +109,7 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, INIT_LIST_HEAD(&ret->ordered_list); spin_lock_init(&ret->list_lock); spin_lock_init(&ret->thres_lock); - trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI); - return ret; -} - -static inline void -__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq); - -struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, - const char *name, - unsigned int flags, - int limit_active, - int thresh) -{ - struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); - - if (!ret) - return NULL; - - ret->normal = __btrfs_alloc_workqueue(fs_info, name, - flags & ~WQ_HIGHPRI, - limit_active, thresh); - if (!ret->normal) { - kfree(ret); - return NULL; - } - - if (flags & WQ_HIGHPRI) { - ret->high = __btrfs_alloc_workqueue(fs_info, name, flags, - limit_active, thresh); - if (!ret->high) { - __btrfs_destroy_workqueue(ret->normal); - kfree(ret); - return NULL; - } - } + trace_btrfs_workqueue_alloc(ret, name); return ret; } @@ -162,7 +118,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, * This hook WILL be called in IRQ handler context, * so workqueue_set_max_active MUST NOT be called in this hook */ -static inline void thresh_queue_hook(struct __btrfs_workqueue *wq) +static inline void thresh_queue_hook(struct btrfs_workqueue *wq) { if (wq->thresh == NO_THRESHOLD) return; @@ -174,7 +130,7 @@ static inline void thresh_queue_hook(struct __btrfs_workqueue *wq) * This hook is called in kthread content. * So workqueue_set_max_active is called here. */ -static inline void thresh_exec_hook(struct __btrfs_workqueue *wq) +static inline void thresh_exec_hook(struct btrfs_workqueue *wq) { int new_current_active; long pending; @@ -217,7 +173,7 @@ out: } } -static void run_ordered_work(struct __btrfs_workqueue *wq, +static void run_ordered_work(struct btrfs_workqueue *wq, struct btrfs_work *self) { struct list_head *list = &wq->ordered_list; @@ -305,7 +261,7 @@ static void btrfs_work_helper(struct work_struct *normal_work) { struct btrfs_work *work = container_of(normal_work, struct btrfs_work, normal_work); - struct __btrfs_workqueue *wq; + struct btrfs_workqueue *wq = work->wq; int need_order = 0; /* @@ -318,7 +274,6 @@ static void btrfs_work_helper(struct work_struct *normal_work) */ if (work->ordered_func) need_order = 1; - wq = work->wq; trace_btrfs_work_sched(work); thresh_exec_hook(wq); @@ -350,8 +305,7 @@ void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func, work->flags = 0; } -static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq, - struct btrfs_work *work) +void btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work) { unsigned long flags; @@ -366,54 +320,22 @@ static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq, queue_work(wq->normal_wq, &work->normal_work); } -void btrfs_queue_work(struct btrfs_workqueue *wq, - struct btrfs_work *work) -{ - struct __btrfs_workqueue *dest_wq; - - if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high) - dest_wq = wq->high; - else - dest_wq = wq->normal; - __btrfs_queue_work(dest_wq, work); -} - -static inline void -__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq) -{ - destroy_workqueue(wq->normal_wq); - trace_btrfs_workqueue_destroy(wq); - kfree(wq); -} - void btrfs_destroy_workqueue(struct btrfs_workqueue *wq) { if (!wq) return; - if (wq->high) - __btrfs_destroy_workqueue(wq->high); - __btrfs_destroy_workqueue(wq->normal); + destroy_workqueue(wq->normal_wq); + trace_btrfs_workqueue_destroy(wq); kfree(wq); } void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int limit_active) { - if (!wq) - return; - wq->normal->limit_active = limit_active; - if (wq->high) - wq->high->limit_active = limit_active; -} - -void btrfs_set_work_high_priority(struct btrfs_work *work) -{ - set_bit(WORK_HIGH_PRIO_BIT, &work->flags); + if (wq) + wq->limit_active = limit_active; } void btrfs_flush_workqueue(struct btrfs_workqueue *wq) { - if (wq->high) - flush_workqueue(wq->high->normal_wq); - - flush_workqueue(wq->normal->normal_wq); + flush_workqueue(wq->normal_wq); } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 3204daa51b95..6e2596ddae10 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -11,11 +11,8 @@ struct btrfs_fs_info; struct btrfs_workqueue; -/* Internal use only */ -struct __btrfs_workqueue; struct btrfs_work; typedef void (*btrfs_func_t)(struct btrfs_work *arg); -typedef void (*btrfs_work_func_t)(struct work_struct *arg); struct btrfs_work { btrfs_func_t func; @@ -25,7 +22,7 @@ struct btrfs_work { /* Don't touch things below */ struct work_struct normal_work; struct list_head ordered_list; - struct __btrfs_workqueue *wq; + struct btrfs_workqueue *wq; unsigned long flags; }; @@ -40,9 +37,8 @@ void btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work); void btrfs_destroy_workqueue(struct btrfs_workqueue *wq); void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max); -void btrfs_set_work_high_priority(struct btrfs_work *work); struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work); -struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq); +struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq); bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq); void btrfs_flush_workqueue(struct btrfs_workqueue *wq); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index c9ee579bc5a6..18374a6d05bd 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -138,6 +138,7 @@ struct share_check { u64 root_objectid; u64 inum; int share_count; + bool have_delayed_delete_refs; }; static inline int extent_is_shared(struct share_check *sc) @@ -288,8 +289,10 @@ static void prelim_release(struct preftree *preftree) struct prelim_ref *ref, *next_ref; rbtree_postorder_for_each_entry_safe(ref, next_ref, - &preftree->root.rb_root, rbnode) + &preftree->root.rb_root, rbnode) { + free_inode_elem_list(ref->inode_list); free_pref(ref); + } preftree->root = RB_ROOT_CACHED; preftree->count = 0; @@ -647,6 +650,18 @@ unode_aux_to_inode_list(struct ulist_node *node) return (struct extent_inode_elem *)(uintptr_t)node->aux; } +static void free_leaf_list(struct ulist *ulist) +{ + struct ulist_node *node; + struct ulist_iterator uiter; + + ULIST_ITER_INIT(&uiter); + while ((node = ulist_next(ulist, &uiter))) + free_inode_elem_list(unode_aux_to_inode_list(node)); + + ulist_free(ulist); +} + /* * We maintain three separate rbtrees: one for direct refs, one for * indirect refs which have a key, and one for indirect refs which do not @@ -761,7 +776,11 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, cond_resched(); } out: - ulist_free(parents); + /* + * We may have inode lists attached to refs in the parents ulist, so we + * must free them before freeing the ulist and its refs. + */ + free_leaf_list(parents); return ret; } @@ -789,11 +808,13 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, if (IS_ERR(eb)) { free_pref(ref); return PTR_ERR(eb); - } else if (!extent_buffer_uptodate(eb)) { + } + if (!extent_buffer_uptodate(eb)) { free_pref(ref); free_extent_buffer(eb); return -EIO; } + if (lock) btrfs_tree_read_lock(eb); if (btrfs_header_level(eb) == 0) @@ -818,16 +839,11 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, struct preftrees *preftrees, struct share_check *sc) { struct btrfs_delayed_ref_node *node; - struct btrfs_delayed_extent_op *extent_op = head->extent_op; struct btrfs_key key; - struct btrfs_key tmp_op_key; struct rb_node *n; int count; int ret = 0; - if (extent_op && extent_op->update_key) - btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key); - spin_lock(&head->lock); for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) { node = rb_entry(n, struct btrfs_delayed_ref_node, @@ -853,10 +869,16 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, case BTRFS_TREE_BLOCK_REF_KEY: { /* NORMAL INDIRECT METADATA backref */ struct btrfs_delayed_tree_ref *ref; + struct btrfs_key *key_ptr = NULL; + + if (head->extent_op && head->extent_op->update_key) { + btrfs_disk_key_to_cpu(&key, &head->extent_op->key); + key_ptr = &key; + } ref = btrfs_delayed_node_to_tree_ref(node); ret = add_indirect_ref(fs_info, preftrees, ref->root, - &tmp_op_key, ref->level + 1, + key_ptr, ref->level + 1, node->bytenr, count, sc, GFP_ATOMIC); break; @@ -882,13 +904,22 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, key.offset = ref->offset; /* - * Found a inum that doesn't match our known inum, we - * know it's shared. + * If we have a share check context and a reference for + * another inode, we can't exit immediately. This is + * because even if this is a BTRFS_ADD_DELAYED_REF + * reference we may find next a BTRFS_DROP_DELAYED_REF + * which cancels out this ADD reference. + * + * If this is a DROP reference and there was no previous + * ADD reference, then we need to signal that when we + * process references from the extent tree (through + * add_inline_refs() and add_keyed_refs()), we should + * not exit early if we find a reference for another + * inode, because one of the delayed DROP references + * may cancel that reference in the extent tree. */ - if (sc && sc->inum && ref->objectid != sc->inum) { - ret = BACKREF_FOUND_SHARED; - goto out; - } + if (sc && count < 0) + sc->have_delayed_delete_refs = true; ret = add_indirect_ref(fs_info, preftrees, ref->root, &key, 0, node->bytenr, count, sc, @@ -918,7 +949,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, } if (!ret) ret = extent_is_shared(sc); -out: + spin_unlock(&head->lock); return ret; } @@ -1021,7 +1052,8 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); - if (sc && sc->inum && key.objectid != sc->inum) { + if (sc && sc->inum && key.objectid != sc->inum && + !sc->have_delayed_delete_refs) { ret = BACKREF_FOUND_SHARED; break; } @@ -1031,6 +1063,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, ret = add_indirect_ref(fs_info, preftrees, root, &key, 0, bytenr, count, sc, GFP_NOFS); + break; } default: @@ -1120,7 +1153,8 @@ static int add_keyed_refs(struct btrfs_root *extent_root, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); - if (sc && sc->inum && key.objectid != sc->inum) { + if (sc && sc->inum && key.objectid != sc->inum && + !sc->have_delayed_delete_refs) { ret = BACKREF_FOUND_SHARED; break; } @@ -1335,7 +1369,8 @@ again: if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; - } else if (!extent_buffer_uptodate(eb)) { + } + if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); ret = -EIO; goto out; @@ -1351,6 +1386,12 @@ again: if (ret < 0) goto out; ref->inode_list = eie; + /* + * We transferred the list ownership to the ref, + * so set to NULL to avoid a double free in case + * an error happens after this. + */ + eie = NULL; } ret = ulist_add_merge_ptr(refs, ref->parent, ref->inode_list, @@ -1376,6 +1417,14 @@ again: eie->next = ref->inode_list; } eie = NULL; + /* + * We have transferred the inode list ownership from + * this ref to the ref we added to the 'refs' ulist. + * So set this ref's inode list to NULL to avoid + * use-after-free when our caller uses it or double + * frees in case an error happens before we return. + */ + ref->inode_list = NULL; } cond_resched(); } @@ -1392,24 +1441,6 @@ out: return ret; } -static void free_leaf_list(struct ulist *blocks) -{ - struct ulist_node *node = NULL; - struct extent_inode_elem *eie; - struct ulist_iterator uiter; - - ULIST_ITER_INIT(&uiter); - while ((node = ulist_next(blocks, &uiter))) { - if (!node->aux) - continue; - eie = unode_aux_to_inode_list(node); - free_inode_elem_list(eie); - node->aux = 0; - } - - ulist_free(blocks); -} - /* * Finds all leafs with a reference to the specified combination of bytenr and * offset. key_list_head will point to a list of corresponding keys (caller must @@ -1508,16 +1539,137 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, return ret; } -/** - * Check if an extent is shared or not +/* + * The caller has joined a transaction or is holding a read lock on the + * fs_info->commit_root_sem semaphore, so no need to worry about the root's last + * snapshot field changing while updating or checking the cache. + */ +static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache, + struct btrfs_root *root, + u64 bytenr, int level, bool *is_shared) +{ + struct btrfs_backref_shared_cache_entry *entry; + + if (!cache->use_cache) + return false; + + if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) + return false; + + /* + * Level -1 is used for the data extent, which is not reliable to cache + * because its reference count can increase or decrease without us + * realizing. We cache results only for extent buffers that lead from + * the root node down to the leaf with the file extent item. + */ + ASSERT(level >= 0); + + entry = &cache->entries[level]; + + /* Unused cache entry or being used for some other extent buffer. */ + if (entry->bytenr != bytenr) + return false; + + /* + * We cached a false result, but the last snapshot generation of the + * root changed, so we now have a snapshot. Don't trust the result. + */ + if (!entry->is_shared && + entry->gen != btrfs_root_last_snapshot(&root->root_item)) + return false; + + /* + * If we cached a true result and the last generation used for dropping + * a root changed, we can not trust the result, because the dropped root + * could be a snapshot sharing this extent buffer. + */ + if (entry->is_shared && + entry->gen != btrfs_get_last_root_drop_gen(root->fs_info)) + return false; + + *is_shared = entry->is_shared; + /* + * If the node at this level is shared, than all nodes below are also + * shared. Currently some of the nodes below may be marked as not shared + * because we have just switched from one leaf to another, and switched + * also other nodes above the leaf and below the current level, so mark + * them as shared. + */ + if (*is_shared) { + for (int i = 0; i < level; i++) { + cache->entries[i].is_shared = true; + cache->entries[i].gen = entry->gen; + } + } + + return true; +} + +/* + * The caller has joined a transaction or is holding a read lock on the + * fs_info->commit_root_sem semaphore, so no need to worry about the root's last + * snapshot field changing while updating or checking the cache. + */ +static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, + struct btrfs_root *root, + u64 bytenr, int level, bool is_shared) +{ + struct btrfs_backref_shared_cache_entry *entry; + u64 gen; + + if (!cache->use_cache) + return; + + if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) + return; + + /* + * Level -1 is used for the data extent, which is not reliable to cache + * because its reference count can increase or decrease without us + * realizing. We cache results only for extent buffers that lead from + * the root node down to the leaf with the file extent item. + */ + ASSERT(level >= 0); + + if (is_shared) + gen = btrfs_get_last_root_drop_gen(root->fs_info); + else + gen = btrfs_root_last_snapshot(&root->root_item); + + entry = &cache->entries[level]; + entry->bytenr = bytenr; + entry->is_shared = is_shared; + entry->gen = gen; + + /* + * If we found an extent buffer is shared, set the cache result for all + * extent buffers below it to true. As nodes in the path are COWed, + * their sharedness is moved to their children, and if a leaf is COWed, + * then the sharedness of a data extent becomes direct, the refcount of + * data extent is increased in the extent item at the extent tree. + */ + if (is_shared) { + for (int i = 0; i < level; i++) { + entry = &cache->entries[i]; + entry->is_shared = is_shared; + entry->gen = gen; + } + } +} + +/* + * Check if a data extent is shared or not. * - * @root: root inode belongs to - * @inum: inode number of the inode whose extent we are checking - * @bytenr: logical bytenr of the extent we are checking - * @roots: list of roots this extent is shared among - * @tmp: temporary list used for iteration + * @root: The root the inode belongs to. + * @inum: Number of the inode whose extent we are checking. + * @bytenr: Logical bytenr of the extent we are checking. + * @extent_gen: Generation of the extent (file extent item) or 0 if it is + * not known. + * @roots: List of roots this extent is shared among. + * @tmp: Temporary list used for iteration. + * @cache: A backref lookup result cache. * - * btrfs_check_shared uses the backref walking code but will short + * btrfs_is_data_extent_shared uses the backref walking code but will short * circuit as soon as it finds a root or inode that doesn't match the * one passed in. This provides a significant performance benefit for * callers (such as fiemap) which want to know whether the extent is @@ -1528,8 +1680,10 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, * * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error. */ -int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr, - struct ulist *roots, struct ulist *tmp) +int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, + u64 extent_gen, + struct ulist *roots, struct ulist *tmp, + struct btrfs_backref_shared_cache *cache) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; @@ -1541,7 +1695,9 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr, .root_objectid = root->root_key.objectid, .inum = inum, .share_count = 0, + .have_delayed_delete_refs = false, }; + int level; ulist_init(roots); ulist_init(tmp); @@ -1558,23 +1714,73 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr, btrfs_get_tree_mod_seq(fs_info, &elem); } + /* -1 means we are in the bytenr of the data extent. */ + level = -1; ULIST_ITER_INIT(&uiter); + cache->use_cache = true; while (1) { + bool is_shared; + bool cached; + ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp, roots, NULL, &shared, false); if (ret == BACKREF_FOUND_SHARED) { /* this is the only condition under which we return 1 */ ret = 1; + if (level >= 0) + store_backref_shared_cache(cache, root, bytenr, + level, true); break; } if (ret < 0 && ret != -ENOENT) break; ret = 0; + /* + * If our data extent is not shared through reflinks and it was + * created in a generation after the last one used to create a + * snapshot of the inode's root, then it can not be shared + * indirectly through subtrees, as that can only happen with + * snapshots. In this case bail out, no need to check for the + * sharedness of extent buffers. + */ + if (level == -1 && + extent_gen > btrfs_root_last_snapshot(&root->root_item)) + break; + + /* + * If our data extent was not directly shared (without multiple + * reference items), than it might have a single reference item + * with a count > 1 for the same offset, which means there are 2 + * (or more) file extent items that point to the data extent - + * this happens when a file extent item needs to be split and + * then one item gets moved to another leaf due to a b+tree leaf + * split when inserting some item. In this case the file extent + * items may be located in different leaves and therefore some + * of the leaves may be referenced through shared subtrees while + * others are not. Since our extent buffer cache only works for + * a single path (by far the most common case and simpler to + * deal with), we can not use it if we have multiple leaves + * (which implies multiple paths). + */ + if (level == -1 && tmp->nnodes > 1) + cache->use_cache = false; + + if (level >= 0) + store_backref_shared_cache(cache, root, bytenr, + level, false); node = ulist_next(tmp, &uiter); if (!node) break; bytenr = node->val; + level++; + cached = lookup_backref_shared_cache(cache, root, bytenr, level, + &is_shared); + if (cached) { + ret = (is_shared ? 1 : 0); + break; + } shared.share_count = 0; + shared.have_delayed_delete_refs = false; cond_resched(); } @@ -2025,10 +2231,29 @@ out: return ret; } +static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) +{ + struct btrfs_data_container *inodes = ctx; + const size_t c = 3 * sizeof(u64); + + if (inodes->bytes_left >= c) { + inodes->bytes_left -= c; + inodes->val[inodes->elem_cnt] = inum; + inodes->val[inodes->elem_cnt + 1] = offset; + inodes->val[inodes->elem_cnt + 2] = root; + inodes->elem_cnt += 3; + } else { + inodes->bytes_missing += c - inodes->bytes_left; + inodes->bytes_left = 0; + inodes->elem_missed += 3; + } + + return 0; +} + int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, struct btrfs_path *path, - iterate_extent_inodes_t *iterate, void *ctx, - bool ignore_offset) + void *ctx, bool ignore_offset) { int ret; u64 extent_item_pos; @@ -2046,17 +2271,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, extent_item_pos = logical - found_key.objectid; ret = iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, search_commit_root, - iterate, ctx, ignore_offset); + build_ino_list, ctx, ignore_offset); return ret; } -typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off, - struct extent_buffer *eb, void *ctx); +static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, + struct extent_buffer *eb, struct inode_fs_paths *ipath); -static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, - struct btrfs_path *path, - iterate_irefs_t *iterate, void *ctx) +static int iterate_inode_refs(u64 inum, struct inode_fs_paths *ipath) { int ret = 0; int slot; @@ -2065,6 +2288,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, u32 name_len; u64 parent = 0; int found = 0; + struct btrfs_root *fs_root = ipath->fs_root; + struct btrfs_path *path = ipath->btrfs_path; struct extent_buffer *eb; struct btrfs_inode_ref *iref; struct btrfs_key found_key; @@ -2100,8 +2325,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, "following ref at offset %u for inode %llu in tree %llu", cur, found_key.objectid, fs_root->root_key.objectid); - ret = iterate(parent, name_len, - (unsigned long)(iref + 1), eb, ctx); + ret = inode_to_path(parent, name_len, + (unsigned long)(iref + 1), eb, ipath); if (ret) break; len = sizeof(*iref) + name_len; @@ -2115,15 +2340,15 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, return ret; } -static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, - struct btrfs_path *path, - iterate_irefs_t *iterate, void *ctx) +static int iterate_inode_extrefs(u64 inum, struct inode_fs_paths *ipath) { int ret; int slot; u64 offset = 0; u64 parent; int found = 0; + struct btrfs_root *fs_root = ipath->fs_root; + struct btrfs_path *path = ipath->btrfs_path; struct extent_buffer *eb; struct btrfs_inode_extref *extref; u32 item_size; @@ -2159,8 +2384,8 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, extref = (struct btrfs_inode_extref *)(ptr + cur_offset); parent = btrfs_inode_extref_parent(eb, extref); name_len = btrfs_inode_extref_name_len(eb, extref); - ret = iterate(parent, name_len, - (unsigned long)&extref->name, eb, ctx); + ret = inode_to_path(parent, name_len, + (unsigned long)&extref->name, eb, ipath); if (ret) break; @@ -2177,34 +2402,13 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, return ret; } -static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, - struct btrfs_path *path, iterate_irefs_t *iterate, - void *ctx) -{ - int ret; - int found_refs = 0; - - ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx); - if (!ret) - ++found_refs; - else if (ret != -ENOENT) - return ret; - - ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx); - if (ret == -ENOENT && found_refs) - return 0; - - return ret; -} - /* * returns 0 if the path could be dumped (probably truncated) * returns <0 in case of an error */ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, - struct extent_buffer *eb, void *ctx) + struct extent_buffer *eb, struct inode_fs_paths *ipath) { - struct inode_fs_paths *ipath = ctx; char *fspath; char *fspath_min; int i = ipath->fspath->elem_cnt; @@ -2245,8 +2449,20 @@ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, */ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath) { - return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path, - inode_to_path, ipath); + int ret; + int found_refs = 0; + + ret = iterate_inode_refs(inum, ipath); + if (!ret) + ++found_refs; + else if (ret != -ENOENT) + return ret; + + ret = iterate_inode_extrefs(inum, ipath); + if (ret == -ENOENT && found_refs) + return 0; + + return ret; } struct btrfs_data_container *init_data_container(u32 total_bytes) diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index ba454032dbe2..8e69584d538d 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -17,6 +17,21 @@ struct inode_fs_paths { struct btrfs_data_container *fspath; }; +struct btrfs_backref_shared_cache_entry { + u64 bytenr; + u64 gen; + bool is_shared; +}; + +struct btrfs_backref_shared_cache { + /* + * A path from a root to a leaf that has a file extent item pointing to + * a given data extent should never exceed the maximum b+tree height. + */ + struct btrfs_backref_shared_cache_entry entries[BTRFS_MAX_LEVEL]; + bool use_cache; +}; + typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, void *ctx); @@ -35,8 +50,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, bool ignore_offset); int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, - struct btrfs_path *path, - iterate_extent_inodes_t *iterate, void *ctx, + struct btrfs_path *path, void *ctx, bool ignore_offset); int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); @@ -63,8 +77,10 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, u64 start_off, struct btrfs_path *path, struct btrfs_inode_extref **ret_extref, u64 *found_off); -int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr, - struct ulist *roots, struct ulist *tmp_ulist); +int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, + u64 extent_gen, + struct ulist *roots, struct ulist *tmp, + struct btrfs_backref_shared_cache *cache); int __init btrfs_prelim_ref_init(void); void __cold btrfs_prelim_ref_exit(void); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 1db24e6d6d90..deebc8ddbd93 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -124,7 +124,16 @@ void btrfs_put_block_group(struct btrfs_block_group *cache) { if (refcount_dec_and_test(&cache->refs)) { WARN_ON(cache->pinned > 0); - WARN_ON(cache->reserved > 0); + /* + * If there was a failure to cleanup a log tree, very likely due + * to an IO failure on a writeback attempt of one or more of its + * extent buffers, we could not do proper (and cheap) unaccounting + * of their reserved space, so don't warn on reserved > 0 in that + * case. + */ + if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) || + !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info)) + WARN_ON(cache->reserved > 0); /* * A block_group shouldn't be on the discard_list anymore. @@ -159,11 +168,12 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, struct rb_node **p; struct rb_node *parent = NULL; struct btrfs_block_group *cache; + bool leftmost = true; ASSERT(block_group->length != 0); - spin_lock(&info->block_group_cache_lock); - p = &info->block_group_cache_tree.rb_node; + write_lock(&info->block_group_cache_lock); + p = &info->block_group_cache_tree.rb_root.rb_node; while (*p) { parent = *p; @@ -172,20 +182,18 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, p = &(*p)->rb_left; } else if (block_group->start > cache->start) { p = &(*p)->rb_right; + leftmost = false; } else { - spin_unlock(&info->block_group_cache_lock); + write_unlock(&info->block_group_cache_lock); return -EEXIST; } } rb_link_node(&block_group->cache_node, parent, p); - rb_insert_color(&block_group->cache_node, - &info->block_group_cache_tree); + rb_insert_color_cached(&block_group->cache_node, + &info->block_group_cache_tree, leftmost); - if (info->first_logical_byte > block_group->start) - info->first_logical_byte = block_group->start; - - spin_unlock(&info->block_group_cache_lock); + write_unlock(&info->block_group_cache_lock); return 0; } @@ -201,8 +209,8 @@ static struct btrfs_block_group *block_group_cache_tree_search( struct rb_node *n; u64 end, start; - spin_lock(&info->block_group_cache_lock); - n = info->block_group_cache_tree.rb_node; + read_lock(&info->block_group_cache_lock); + n = info->block_group_cache_tree.rb_root.rb_node; while (n) { cache = rb_entry(n, struct btrfs_block_group, cache_node); @@ -224,12 +232,9 @@ static struct btrfs_block_group *block_group_cache_tree_search( break; } } - if (ret) { + if (ret) btrfs_get_block_group(ret); - if (bytenr == 0 && info->first_logical_byte > ret->start) - info->first_logical_byte = ret->start; - } - spin_unlock(&info->block_group_cache_lock); + read_unlock(&info->block_group_cache_lock); return ret; } @@ -258,15 +263,15 @@ struct btrfs_block_group *btrfs_next_block_group( struct btrfs_fs_info *fs_info = cache->fs_info; struct rb_node *node; - spin_lock(&fs_info->block_group_cache_lock); + read_lock(&fs_info->block_group_cache_lock); /* If our block group was removed, we need a full search. */ if (RB_EMPTY_NODE(&cache->cache_node)) { const u64 next_bytenr = cache->start + cache->length; - spin_unlock(&fs_info->block_group_cache_lock); + read_unlock(&fs_info->block_group_cache_lock); btrfs_put_block_group(cache); - cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache; + return btrfs_lookup_first_block_group(fs_info, next_bytenr); } node = rb_next(&cache->cache_node); btrfs_put_block_group(cache); @@ -275,46 +280,70 @@ struct btrfs_block_group *btrfs_next_block_group( btrfs_get_block_group(cache); } else cache = NULL; - spin_unlock(&fs_info->block_group_cache_lock); + read_unlock(&fs_info->block_group_cache_lock); return cache; } -bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +/** + * Check if we can do a NOCOW write for a given extent. + * + * @fs_info: The filesystem information object. + * @bytenr: Logical start address of the extent. + * + * Check if we can do a NOCOW write for the given extent, and increments the + * number of NOCOW writers in the block group that contains the extent, as long + * as the block group exists and it's currently not in read-only mode. + * + * Returns: A non-NULL block group pointer if we can do a NOCOW write, the caller + * is responsible for calling btrfs_dec_nocow_writers() later. + * + * Or NULL if we can not do a NOCOW write + */ +struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, + u64 bytenr) { struct btrfs_block_group *bg; - bool ret = true; + bool can_nocow = true; bg = btrfs_lookup_block_group(fs_info, bytenr); if (!bg) - return false; + return NULL; spin_lock(&bg->lock); if (bg->ro) - ret = false; + can_nocow = false; else atomic_inc(&bg->nocow_writers); spin_unlock(&bg->lock); - /* No put on block group, done by btrfs_dec_nocow_writers */ - if (!ret) + if (!can_nocow) { btrfs_put_block_group(bg); + return NULL; + } - return ret; + /* No put on block group, done by btrfs_dec_nocow_writers(). */ + return bg; } -void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +/** + * Decrement the number of NOCOW writers in a block group. + * + * @bg: The block group. + * + * This is meant to be called after a previous call to btrfs_inc_nocow_writers(), + * and on the block group returned by that call. Typically this is called after + * creating an ordered extent for a NOCOW write, to prevent races with scrub and + * relocation. + * + * After this call, the caller should not use the block group anymore. It it wants + * to use it, then it should get a reference on it before calling this function. + */ +void btrfs_dec_nocow_writers(struct btrfs_block_group *bg) { - struct btrfs_block_group *bg; - - bg = btrfs_lookup_block_group(fs_info, bytenr); - ASSERT(bg); if (atomic_dec_and_test(&bg->nocow_writers)) wake_up_var(&bg->nocow_writers); - /* - * Once for our lookup and once for the lookup done by a previous call - * to btrfs_inc_nocow_writers() - */ - btrfs_put_block_group(bg); + + /* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */ btrfs_put_block_group(bg); } @@ -411,39 +440,26 @@ void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, btrfs_put_caching_control(caching_ctl); } -int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache) +static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache, + struct btrfs_caching_control *caching_ctl) +{ + wait_event(caching_ctl->wait, btrfs_block_group_done(cache)); + return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0; +} + +static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache) { struct btrfs_caching_control *caching_ctl; - int ret = 0; + int ret; caching_ctl = btrfs_get_caching_control(cache); if (!caching_ctl) return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0; - - wait_event(caching_ctl->wait, btrfs_block_group_done(cache)); - if (cache->cached == BTRFS_CACHE_ERROR) - ret = -EIO; + ret = btrfs_caching_ctl_wait_done(cache, caching_ctl); btrfs_put_caching_control(caching_ctl); return ret; } -static bool space_cache_v1_done(struct btrfs_block_group *cache) -{ - bool ret; - - spin_lock(&cache->lock); - ret = cache->cached != BTRFS_CACHE_FAST; - spin_unlock(&cache->lock); - - return ret; -} - -void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache, - struct btrfs_caching_control *caching_ctl) -{ - wait_event(caching_ctl->wait, space_cache_v1_done(cache)); -} - #ifdef CONFIG_BTRFS_DEBUG static void fragment_free_space(struct btrfs_block_group *block_group) { @@ -577,8 +593,6 @@ next: if (need_resched() || rwsem_is_contended(&fs_info->commit_root_sem)) { - if (wakeup) - caching_ctl->progress = last; btrfs_release_path(path); up_read(&fs_info->commit_root_sem); mutex_unlock(&caching_ctl->mutex); @@ -602,9 +616,6 @@ next: key.objectid = last; key.offset = 0; key.type = BTRFS_EXTENT_ITEM_KEY; - - if (wakeup) - caching_ctl->progress = last; btrfs_release_path(path); goto next; } @@ -639,7 +650,6 @@ next: total_found += add_new_free_space(block_group, last, block_group->start + block_group->length); - caching_ctl->progress = (u64)-1; out: btrfs_free_path(path); @@ -709,8 +719,6 @@ done: } #endif - caching_ctl->progress = (u64)-1; - up_read(&fs_info->commit_root_sem); btrfs_free_excluded_extents(block_group); mutex_unlock(&caching_ctl->mutex); @@ -721,9 +729,8 @@ done: btrfs_put_block_group(block_group); } -int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only) +int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait) { - DEFINE_WAIT(wait); struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_caching_control *caching_ctl = NULL; int ret = 0; @@ -740,7 +747,6 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only mutex_init(&caching_ctl->mutex); init_waitqueue_head(&caching_ctl->wait); caching_ctl->block_group = cache; - caching_ctl->progress = cache->start; refcount_set(&caching_ctl->count, 2); btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); @@ -756,24 +762,20 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only } WARN_ON(cache->caching_ctl); cache->caching_ctl = caching_ctl; - if (btrfs_test_opt(fs_info, SPACE_CACHE)) - cache->cached = BTRFS_CACHE_FAST; - else - cache->cached = BTRFS_CACHE_STARTED; - cache->has_caching_ctl = 1; + cache->cached = BTRFS_CACHE_STARTED; spin_unlock(&cache->lock); - spin_lock(&fs_info->block_group_cache_lock); + write_lock(&fs_info->block_group_cache_lock); refcount_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); - spin_unlock(&fs_info->block_group_cache_lock); + write_unlock(&fs_info->block_group_cache_lock); btrfs_get_block_group(cache); btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); out: - if (load_cache_only && caching_ctl) - btrfs_wait_space_cache_v1_finished(cache, caching_ctl); + if (wait && caching_ctl) + ret = btrfs_caching_ctl_wait_done(cache, caching_ctl); if (caching_ctl) btrfs_put_caching_control(caching_ctl); @@ -948,17 +950,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, if (ret) goto out; - spin_lock(&fs_info->block_group_cache_lock); - rb_erase(&block_group->cache_node, - &fs_info->block_group_cache_tree); + write_lock(&fs_info->block_group_cache_lock); + rb_erase_cached(&block_group->cache_node, + &fs_info->block_group_cache_tree); RB_CLEAR_NODE(&block_group->cache_node); /* Once for the block groups rbtree */ btrfs_put_block_group(block_group); - if (fs_info->first_logical_byte == block_group->start) - fs_info->first_logical_byte = (u64)-1; - spin_unlock(&fs_info->block_group_cache_lock); + write_unlock(&fs_info->block_group_cache_lock); down_write(&block_group->space_info->groups_sem); /* @@ -978,32 +978,31 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, kobject_put(kobj); } - if (block_group->has_caching_ctl) - caching_ctl = btrfs_get_caching_control(block_group); if (block_group->cached == BTRFS_CACHE_STARTED) btrfs_wait_block_group_cache_done(block_group); - if (block_group->has_caching_ctl) { - spin_lock(&fs_info->block_group_cache_lock); - if (!caching_ctl) { - struct btrfs_caching_control *ctl; - - list_for_each_entry(ctl, - &fs_info->caching_block_groups, list) - if (ctl->block_group == block_group) { - caching_ctl = ctl; - refcount_inc(&caching_ctl->count); - break; - } - } - if (caching_ctl) - list_del_init(&caching_ctl->list); - spin_unlock(&fs_info->block_group_cache_lock); - if (caching_ctl) { - /* Once for the caching bgs list and once for us. */ - btrfs_put_caching_control(caching_ctl); - btrfs_put_caching_control(caching_ctl); + + write_lock(&fs_info->block_group_cache_lock); + caching_ctl = btrfs_get_caching_control(block_group); + if (!caching_ctl) { + struct btrfs_caching_control *ctl; + + list_for_each_entry(ctl, &fs_info->caching_block_groups, list) { + if (ctl->block_group == block_group) { + caching_ctl = ctl; + refcount_inc(&caching_ctl->count); + break; + } } } + if (caching_ctl) + list_del_init(&caching_ctl->list); + write_unlock(&fs_info->block_group_cache_lock); + + if (caching_ctl) { + /* Once for the caching bgs list and once for us. */ + btrfs_put_caching_control(caching_ctl); + btrfs_put_caching_control(caching_ctl); + } spin_lock(&trans->transaction->dirty_bgs_lock); WARN_ON(!list_empty(&block_group->dirty_list)); @@ -1024,8 +1023,14 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, < block_group->zone_unusable); WARN_ON(block_group->space_info->disk_total < block_group->length * factor); + WARN_ON(test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, + &block_group->runtime_flags) && + block_group->space_info->active_total_bytes + < block_group->length); } block_group->space_info->total_bytes -= block_group->length; + if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) + block_group->space_info->active_total_bytes -= block_group->length; block_group->space_info->bytes_readonly -= (block_group->length - block_group->zone_unusable); block_group->space_info->bytes_zone_unusable -= @@ -1054,7 +1059,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, goto out; spin_lock(&block_group->lock); - block_group->removed = 1; + set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags); + /* * At this point trimming or scrub can't start on this block group, * because we removed the block group from the rbtree @@ -1289,6 +1295,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; + if (btrfs_fs_closing(fs_info)) + return; + /* * Long running balances can keep us blocked here for eternity, so * simply skip deletion if we're unable to get the mutex. @@ -1358,6 +1367,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) goto next; } + ret = btrfs_zone_finish(block_group); + if (ret < 0) { + btrfs_dec_block_group_ro(block_group); + if (ret == -EAGAIN) + ret = 0; + goto next; + } + /* * Want to do this before we do anything else so we can recover * properly if we fail to join the transaction. @@ -1503,6 +1520,13 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a, return bg1->used > bg2->used; } +static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info) +{ + if (btrfs_is_zoned(fs_info)) + return btrfs_zoned_should_reclaim(fs_info); + return true; +} + void btrfs_reclaim_bgs_work(struct work_struct *work) { struct btrfs_fs_info *fs_info = @@ -1513,15 +1537,26 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) + if (btrfs_fs_closing(fs_info)) + return; + + if (!btrfs_should_reclaim(fs_info)) return; + sb_start_write(fs_info->sb); + + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { + sb_end_write(fs_info->sb); + return; + } + /* * Long running balances can keep us blocked here for eternity, so * simply skip reclaim if we're unable to get the mutex. */ if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) { btrfs_exclop_finish(fs_info); + sb_end_write(fs_info->sb); return; } @@ -1585,9 +1620,11 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) div64_u64(zone_unusable * 100, bg->length)); trace_btrfs_reclaim_block_group(bg); ret = btrfs_relocate_chunk(fs_info, bg->start); - if (ret) + if (ret) { + btrfs_dec_block_group_ro(bg); btrfs_err(fs_info, "error relocating chunk %llu", bg->start); + } next: btrfs_put_block_group(bg); @@ -1596,6 +1633,7 @@ next: spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_exclop_finish(fs_info); + sb_end_write(fs_info->sb); } void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) @@ -1677,35 +1715,13 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info, struct btrfs_root *root = btrfs_block_group_root(fs_info); int ret; struct btrfs_key found_key; - struct extent_buffer *leaf; - int slot; - - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); - if (ret < 0) - return ret; - - while (1) { - slot = path->slots[0]; - leaf = path->nodes[0]; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto out; - break; - } - btrfs_item_key_to_cpu(leaf, &found_key, slot); + btrfs_for_each_slot(root, key, &found_key, path, ret) { if (found_key.objectid >= key->objectid && found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { - ret = read_bg_from_eb(fs_info, &found_key, path); - break; + return read_bg_from_eb(fs_info, &found_key, path); } - - path->slots[0]++; } -out: return ret; } @@ -1787,11 +1803,10 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, stripe_nr = physical - map->stripes[i].physical; stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset); - if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)) { stripe_nr = stripe_nr * map->num_stripes + i; stripe_nr = div_u64(stripe_nr, map->sub_stripes); - } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - stripe_nr = stripe_nr * map->num_stripes + i; } /* * The remaining case would be for RAID56, multiply by @@ -1872,16 +1887,6 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) return 0; } -static void link_block_group(struct btrfs_block_group *cache) -{ - struct btrfs_space_info *space_info = cache->space_info; - int index = btrfs_bg_flags_to_raid_index(cache->flags); - - down_write(&space_info->groups_sem); - list_add_tail(&cache->list, &space_info->block_groups[index]); - up_write(&space_info->groups_sem); -} - static struct btrfs_block_group *btrfs_create_block_group_cache( struct btrfs_fs_info *fs_info, u64 start) { @@ -1919,7 +1924,8 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( btrfs_init_free_space_ctl(cache, cache->free_space_ctl); atomic_set(&cache->frozen, 0); mutex_init(&cache->free_space_lock); - btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root); + cache->full_stripe_locks_root.root = RB_ROOT; + mutex_init(&cache->full_stripe_locks_root.lock); return cache; } @@ -1984,7 +1990,6 @@ static int read_one_block_group(struct btrfs_fs_info *info, int need_clear) { struct btrfs_block_group *cache; - struct btrfs_space_info *space_info; const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS); int ret; @@ -1997,6 +2002,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, cache->length = key->offset; cache->used = btrfs_stack_block_group_used(bgi); cache->flags = btrfs_stack_block_group_flags(bgi); + cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); set_free_space_tree_thresholds(cache); @@ -2059,11 +2065,9 @@ static int read_one_block_group(struct btrfs_fs_info *info, /* Should not have any excluded extents. Just in case, though. */ btrfs_free_excluded_extents(cache); } else if (cache->length == cache->used) { - cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; btrfs_free_excluded_extents(cache); } else if (cache->used == 0) { - cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; add_new_free_space(cache, cache->start, cache->start + cache->length); @@ -2076,13 +2080,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, goto error; } trace_btrfs_add_block_group(info, cache, 0); - btrfs_update_space_info(info, cache->flags, cache->length, - cache->used, cache->bytes_super, - cache->zone_unusable, &space_info); - - cache->space_info = space_info; - - link_block_group(cache); + btrfs_add_bg_to_space_info(info, cache); set_avail_alloc_bits(info, cache->flags); if (btrfs_chunk_writeable(info, cache->start)) { @@ -2106,7 +2104,6 @@ error: static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) { struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct btrfs_space_info *space_info; struct rb_node *node; int ret = 0; @@ -2126,7 +2123,6 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) /* Fill dummy cache as FULL */ bg->length = em->len; bg->flags = map->type; - bg->last_byte_to_unpin = (u64)-1; bg->cached = BTRFS_CACHE_FINISHED; bg->used = em->len; bg->flags = map->type; @@ -2147,10 +2143,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) break; } - btrfs_update_space_info(fs_info, bg->flags, em->len, em->len, - 0, 0, &space_info); - bg->space_info = space_info; - link_block_group(bg); + btrfs_add_bg_to_space_info(fs_info, bg); set_avail_alloc_bits(fs_info, bg->flags); } @@ -2170,7 +2163,16 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) int need_clear = 0; u64 cache_gen; - if (!root) + /* + * Either no extent root (with ibadroots rescue option) or we have + * unsupported RO options. The fs can never be mounted read-write, so no + * need to waste time searching block group items. + * + * This also allows new extent tree related changes to be RO compat, + * no need for a full incompat flag. + */ + if (!root || (btrfs_super_compat_ro_flags(info->super_copy) & + ~BTRFS_FEATURE_COMPAT_RO_SUPP)) return fill_dummy_bgs(info); key.objectid = 0; @@ -2279,7 +2281,7 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, spin_lock(&block_group->lock); btrfs_set_stack_block_group_used(&bgi, block_group->used); btrfs_set_stack_block_group_chunk_objectid(&bgi, - BTRFS_FIRST_CHUNK_TREE_OBJECTID); + block_group->global_root_id); btrfs_set_stack_block_group_flags(&bgi, block_group->flags); key.objectid = block_group->start; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; @@ -2405,7 +2407,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) ret = insert_block_group_item(trans, block_group); if (ret) btrfs_abort_transaction(trans, ret); - if (!block_group->chunk_item_inserted) { + if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, + &block_group->runtime_flags)) { mutex_lock(&fs_info->chunk_mutex); ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group); mutex_unlock(&fs_info->chunk_mutex); @@ -2435,6 +2438,27 @@ next: btrfs_trans_release_chunk_metadata(trans); } +/* + * For extent tree v2 we use the block_group_item->chunk_offset to point at our + * global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID. + */ +static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset) +{ + u64 div = SZ_1G; + u64 index; + + if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return BTRFS_FIRST_CHUNK_TREE_OBJECTID; + + /* If we have a smaller fs index based on 128MiB. */ + if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL)) + div = SZ_128M; + + offset = div64_u64(offset, div); + div64_u64_rem(offset, fs_info->nr_global_roots, &index); + return index; +} + struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, u64 type, u64 chunk_offset, u64 size) @@ -2453,8 +2477,9 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran set_free_space_tree_thresholds(cache); cache->used = bytes_used; cache->flags = type; - cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; + cache->global_root_id = calculate_global_root_id(fs_info, cache->start); + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) cache->needs_free_space = 1; @@ -2464,12 +2489,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran return ERR_PTR(ret); } - /* - * New block group is likely to be used soon. Try to activate it now. - * Failure is OK for now. - */ - btrfs_zone_activate(cache); - ret = exclude_super_stripes(cache); if (ret) { /* We may have excluded something, so call this just in case */ @@ -2482,14 +2501,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran btrfs_free_excluded_extents(cache); -#ifdef CONFIG_BTRFS_DEBUG - if (btrfs_should_fragment_free_space(cache)) { - u64 new_bytes_used = size - bytes_used; - - bytes_used += new_bytes_used >> 1; - fragment_free_space(cache); - } -#endif /* * Ensure the corresponding space_info object is created and * assigned to our block group. We want our bg to be added to the rbtree @@ -2510,12 +2521,17 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran * the rbtree, update the space info's counters. */ trace_btrfs_add_block_group(fs_info, cache, 1); - btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, - cache->bytes_super, cache->zone_unusable, - &cache->space_info); + btrfs_add_bg_to_space_info(fs_info, cache); btrfs_update_global_block_rsv(fs_info); - link_block_group(cache); +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(cache)) { + u64 new_bytes_used = size - bytes_used; + + cache->space_info->bytes_used += new_bytes_used >> 1; + fragment_free_space(cache); + } +#endif list_add_tail(&cache->bg_list, &trans->new_bgs); trans->delayed_ref_updates++; @@ -2544,6 +2560,19 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, int ret; bool dirty_bg_running; + /* + * This can only happen when we are doing read-only scrub on read-only + * mount. + * In that case we should not start a new transaction on read-only fs. + * Thus here we skip all chunk allocations. + */ + if (sb_rdonly(fs_info->sb)) { + mutex_lock(&fs_info->ro_block_group_mutex); + ret = inc_block_group_ro(cache, 0); + mutex_unlock(&fs_info->ro_block_group_mutex); + return ret; + } + do { trans = btrfs_join_transaction(root); if (IS_ERR(trans)) @@ -2599,6 +2628,14 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); if (ret < 0) goto out; + /* + * We have allocated a new chunk. We also need to activate that chunk to + * grant metadata tickets for zoned filesystem. + */ + ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true); + if (ret < 0) + goto out; + ret = inc_block_group_ro(cache, 0); if (ret == -ETXTBSY) goto unlock_out; @@ -2671,7 +2708,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, bi = btrfs_item_ptr_offset(leaf, path->slots[0]); btrfs_set_stack_block_group_used(&bgi, cache->used); btrfs_set_stack_block_group_chunk_objectid(&bgi, - BTRFS_FIRST_CHUNK_TREE_OBJECTID); + cache->global_root_id); btrfs_set_stack_block_group_flags(&bgi, cache->flags); write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); btrfs_mark_buffer_dirty(leaf); @@ -2811,7 +2848,7 @@ again: cache_size *= fs_info->sectorsize; ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0, - cache_size); + cache_size, false); if (ret) goto out_put; @@ -2894,7 +2931,6 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) struct btrfs_path *path = NULL; LIST_HEAD(dirty); struct list_head *io = &cur_trans->io_bgs; - int num_started = 0; int loops = 0; spin_lock(&cur_trans->dirty_bgs_lock); @@ -2960,7 +2996,6 @@ again: cache->io_ctl.inode = NULL; ret = btrfs_write_out_cache(trans, cache, path); if (ret == 0 && cache->io_ctl.inode) { - num_started++; should_put = 0; /* @@ -3061,7 +3096,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) int should_put; struct btrfs_path *path; struct list_head *io = &cur_trans->io_bgs; - int num_started = 0; path = btrfs_alloc_path(); if (!path) @@ -3119,7 +3153,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) cache->io_ctl.inode = NULL; ret = btrfs_write_out_cache(trans, cache, path); if (ret == 0 && cache->io_ctl.inode) { - num_started++; should_put = 0; list_add_tail(&cache->io_list, io); } else { @@ -3178,6 +3211,31 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) return ret; } +static inline bool should_reclaim_block_group(struct btrfs_block_group *bg, + u64 bytes_freed) +{ + const struct btrfs_space_info *space_info = bg->space_info; + const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold); + const u64 new_val = bg->used; + const u64 old_val = new_val + bytes_freed; + u64 thresh; + + if (reclaim_thresh == 0) + return false; + + thresh = div_factor_fine(bg->length, reclaim_thresh); + + /* + * If we were below the threshold before don't reclaim, we are likely a + * brand new block group and we don't want to relocate new block groups. + */ + if (old_val < thresh) + return false; + if (new_val >= thresh) + return false; + return true; +} + int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, bool alloc) { @@ -3200,6 +3258,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&info->delalloc_root_lock); while (total) { + bool reclaim; + cache = btrfs_lookup_block_group(info, bytenr); if (!cache) { ret = -ENOENT; @@ -3214,7 +3274,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, * space back to the block group, otherwise we will leak space. */ if (!alloc && !btrfs_block_group_done(cache)) - btrfs_cache_block_group(cache, 1); + btrfs_cache_block_group(cache, true); byte_in_group = bytenr - cache->start; WARN_ON(byte_in_group > cache->length); @@ -3245,6 +3305,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, cache->space_info, num_bytes); cache->space_info->bytes_used -= num_bytes; cache->space_info->disk_used -= num_bytes * factor; + + reclaim = should_reclaim_block_group(cache, num_bytes); spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -3271,6 +3333,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, if (!alloc && old_val == 0) { if (!btrfs_test_opt(info, DISCARD_ASYNC)) btrfs_mark_bg_unused(cache); + } else if (!alloc && reclaim) { + btrfs_mark_bg_to_reclaim(cache); } btrfs_put_block_group(cache); @@ -3403,7 +3467,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); } -static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) +static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) { struct btrfs_block_group *bg; int ret; @@ -3490,7 +3554,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) out: btrfs_trans_release_chunk_metadata(trans); - return ret; + if (ret) + return ERR_PTR(ret); + + btrfs_get_block_group(bg); + return bg; } /* @@ -3605,10 +3673,17 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_space_info *space_info; + struct btrfs_block_group *ret_bg; bool wait_for_alloc = false; bool should_alloc = false; + bool from_extent_allocation = false; int ret = 0; + if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) { + from_extent_allocation = true; + force = CHUNK_ALLOC_FORCE; + } + /* Don't re-enter if we're already allocating a chunk */ if (trans->allocating_chunk) return -ENOSPC; @@ -3663,6 +3738,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, * attempt. */ wait_for_alloc = true; + force = CHUNK_ALLOC_NO_FORCE; spin_unlock(&space_info->lock); mutex_lock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->chunk_mutex); @@ -3698,9 +3774,22 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, force_metadata_allocation(fs_info); } - ret = do_chunk_alloc(trans, flags); + ret_bg = do_chunk_alloc(trans, flags); trans->allocating_chunk = false; + if (IS_ERR(ret_bg)) { + ret = PTR_ERR(ret_bg); + } else if (from_extent_allocation) { + /* + * New block group is likely to be used soon. Try to activate + * it now. Failure is OK for now. + */ + btrfs_zone_activate(ret_bg); + } + + if (!ret) + btrfs_put_block_group(ret_bg); + spin_lock(&space_info->lock); if (ret < 0) { if (ret == -ENOSPC) @@ -3773,6 +3862,14 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans, ret = PTR_ERR(bg); } else { /* + * We have a new chunk. We also need to activate it for + * zoned filesystem. + */ + ret = btrfs_zoned_activate_one_bg(fs_info, info, true); + if (ret < 0) + return; + + /* * If we fail to add the chunk item here, we end up * trying again at phase 2 of chunk allocation, at * btrfs_create_pending_block_groups(). So ignore @@ -3847,35 +3944,24 @@ void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans, void btrfs_put_block_group_cache(struct btrfs_fs_info *info) { struct btrfs_block_group *block_group; - u64 last = 0; - while (1) { - struct inode *inode; + block_group = btrfs_lookup_first_block_group(info, 0); + while (block_group) { + btrfs_wait_block_group_cache_done(block_group); + spin_lock(&block_group->lock); + if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, + &block_group->runtime_flags)) { + struct inode *inode = block_group->inode; - block_group = btrfs_lookup_first_block_group(info, last); - while (block_group) { - btrfs_wait_block_group_cache_done(block_group); - spin_lock(&block_group->lock); - if (block_group->iref) - break; + block_group->inode = NULL; spin_unlock(&block_group->lock); - block_group = btrfs_next_block_group(block_group); - } - if (!block_group) { - if (last == 0) - break; - last = 0; - continue; - } - inode = block_group->inode; - block_group->iref = 0; - block_group->inode = NULL; - spin_unlock(&block_group->lock); - ASSERT(block_group->io_ctl.inode == NULL); - iput(inode); - last = block_group->start + block_group->length; - btrfs_put_block_group(block_group); + ASSERT(block_group->io_ctl.inode == NULL); + iput(inode); + } else { + spin_unlock(&block_group->lock); + } + block_group = btrfs_next_block_group(block_group); } } @@ -3891,14 +3977,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) struct btrfs_caching_control *caching_ctl; struct rb_node *n; - spin_lock(&info->block_group_cache_lock); + write_lock(&info->block_group_cache_lock); while (!list_empty(&info->caching_block_groups)) { caching_ctl = list_entry(info->caching_block_groups.next, struct btrfs_caching_control, list); list_del(&caching_ctl->list); btrfs_put_caching_control(caching_ctl); } - spin_unlock(&info->block_group_cache_lock); + write_unlock(&info->block_group_cache_lock); spin_lock(&info->unused_bgs_lock); while (!list_empty(&info->unused_bgs)) { @@ -3928,14 +4014,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) } spin_unlock(&info->zone_active_bgs_lock); - spin_lock(&info->block_group_cache_lock); - while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { + write_lock(&info->block_group_cache_lock); + while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) { block_group = rb_entry(n, struct btrfs_block_group, cache_node); - rb_erase(&block_group->cache_node, - &info->block_group_cache_tree); + rb_erase_cached(&block_group->cache_node, + &info->block_group_cache_tree); RB_CLEAR_NODE(&block_group->cache_node); - spin_unlock(&info->block_group_cache_lock); + write_unlock(&info->block_group_cache_lock); down_write(&block_group->space_info->groups_sem); list_del(&block_group->list); @@ -3958,9 +4044,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) ASSERT(block_group->swap_extents == 0); btrfs_put_block_group(block_group); - spin_lock(&info->block_group_cache_lock); + write_lock(&info->block_group_cache_lock); } - spin_unlock(&info->block_group_cache_lock); + write_unlock(&info->block_group_cache_lock); btrfs_release_global_block_rsv(info); @@ -3974,9 +4060,22 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) * important and indicates a real bug if this happens. */ if (WARN_ON(space_info->bytes_pinned > 0 || - space_info->bytes_reserved > 0 || space_info->bytes_may_use > 0)) btrfs_dump_space_info(info, space_info, 0, 0); + + /* + * If there was a failure to cleanup a log tree, very likely due + * to an IO failure on a writeback attempt of one or more of its + * extent buffers, we could not do proper (and cheap) unaccounting + * of their reserved space, so don't warn on bytes_reserved > 0 in + * that case. + */ + if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) || + !BTRFS_FS_LOG_CLEANUP_ERROR(info)) { + if (WARN_ON(space_info->bytes_reserved > 0)) + btrfs_dump_space_info(info, space_info, 0, 0); + } + WARN_ON(space_info->reclaim_size > 0); list_del(&space_info->list); btrfs_sysfs_remove_space_info(space_info); @@ -3998,7 +4097,7 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) spin_lock(&block_group->lock); cleanup = (atomic_dec_and_test(&block_group->frozen) && - block_group->removed); + test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)); spin_unlock(&block_group->lock); if (cleanup) { @@ -4019,7 +4118,7 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) * tasks trimming this block group have left 1 entry each one. * Free them if any. */ - __btrfs_remove_free_space_cache(block_group->free_space_ctl); + btrfs_remove_free_space_cache(block_group); } } diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 5878b7ce3b78..8fb14b99a1d1 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -35,11 +35,33 @@ enum btrfs_discard_state { * the FS with empty chunks * * CHUNK_ALLOC_FORCE means it must try to allocate one + * + * CHUNK_ALLOC_FORCE_FOR_EXTENT like CHUNK_ALLOC_FORCE but called from + * find_free_extent() that also activaes the zone */ enum btrfs_chunk_alloc_enum { CHUNK_ALLOC_NO_FORCE, CHUNK_ALLOC_LIMITED, CHUNK_ALLOC_FORCE, + CHUNK_ALLOC_FORCE_FOR_EXTENT, +}; + +/* Block group flags set at runtime */ +enum btrfs_block_group_flags { + BLOCK_GROUP_FLAG_IREF, + BLOCK_GROUP_FLAG_REMOVED, + BLOCK_GROUP_FLAG_TO_COPY, + BLOCK_GROUP_FLAG_RELOCATING_REPAIR, + BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, + BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, + BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, +}; + +enum btrfs_caching_type { + BTRFS_CACHE_NO, + BTRFS_CACHE_STARTED, + BTRFS_CACHE_FINISHED, + BTRFS_CACHE_ERROR, }; struct btrfs_caching_control { @@ -48,13 +70,20 @@ struct btrfs_caching_control { wait_queue_head_t wait; struct btrfs_work work; struct btrfs_block_group *block_group; - u64 progress; refcount_t count; }; /* Once caching_thread() finds this much free space, it will wake up waiters. */ #define CACHING_CTL_WAKE_UP SZ_2M +/* + * Tree to record all locked full stripes of a RAID5/6 block group + */ +struct btrfs_full_stripe_locks_tree { + struct rb_root root; + struct mutex lock; +}; + struct btrfs_block_group { struct btrfs_fs_info *fs_info; struct inode *inode; @@ -68,6 +97,7 @@ struct btrfs_block_group { u64 bytes_super; u64 flags; u64 cache_generation; + u64 global_root_id; /* * If the free space extent count exceeds this number, convert the block @@ -90,22 +120,15 @@ struct btrfs_block_group { /* For raid56, this is a full stripe, without parity */ unsigned long full_stripe_len; + unsigned long runtime_flags; unsigned int ro; - unsigned int iref:1; - unsigned int has_caching_ctl:1; - unsigned int removed:1; - unsigned int to_copy:1; - unsigned int relocating_repair:1; - unsigned int chunk_item_inserted:1; - unsigned int zone_is_active:1; int disk_cache_state; /* Cache tracking stuff */ int cached; struct btrfs_caching_control *caching_ctl; - u64 last_byte_to_unpin; struct btrfs_space_info *space_info; @@ -207,6 +230,8 @@ struct btrfs_block_group { u64 meta_write_pointer; struct map_lookup *physical_map; struct list_head active_bg_list; + struct work_struct zone_finish_work; + struct extent_buffer *last_eb; }; static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) @@ -249,14 +274,13 @@ void btrfs_put_block_group(struct btrfs_block_group *cache); void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start); void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg); -bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); -void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); +struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, + u64 bytenr); +void btrfs_dec_nocow_writers(struct btrfs_block_group *bg); void btrfs_wait_nocow_writers(struct btrfs_block_group *bg); void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, u64 num_bytes); -int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache); -int btrfs_cache_block_group(struct btrfs_block_group *cache, - int load_cache_only); +int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait); void btrfs_put_caching_control(struct btrfs_caching_control *ctl); struct btrfs_caching_control *btrfs_get_caching_control( struct btrfs_block_group *cache); @@ -298,8 +322,6 @@ void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans, u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); int btrfs_free_block_groups(struct btrfs_fs_info *info); -void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache, - struct btrfs_caching_control *caching_ctl); int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, struct block_device *bdev, u64 physical, u64 **logical, int *naddrs, int *stripe_len); diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index b3ee49b0b1e8..ec96285357e0 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -118,7 +118,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; block_rsv->reserved = block_rsv->size; - block_rsv->full = 1; + block_rsv->full = true; } else { num_bytes = 0; } @@ -142,7 +142,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, bytes_to_add = min(num_bytes, bytes_to_add); dest->reserved += bytes_to_add; if (dest->reserved >= dest->size) - dest->full = 1; + dest->full = true; num_bytes -= bytes_to_add; } spin_unlock(&dest->lock); @@ -171,7 +171,7 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, return 0; } -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, enum btrfs_rsv_type type) { memset(rsv, 0, sizeof(*rsv)); spin_lock_init(&rsv->lock); @@ -180,7 +180,7 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv, - unsigned short type) + enum btrfs_rsv_type type) { btrfs_init_block_rsv(rsv, type); rsv->space_info = btrfs_find_space_info(fs_info, @@ -188,7 +188,7 @@ void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, } struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, - unsigned short type) + enum btrfs_rsv_type type) { struct btrfs_block_rsv *block_rsv; @@ -286,7 +286,7 @@ u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, */ if (block_rsv == delayed_rsv) target = global_rsv; - else if (block_rsv != global_rsv && !delayed_rsv->full) + else if (block_rsv != global_rsv && !btrfs_block_rsv_full(delayed_rsv)) target = delayed_rsv; if (target && block_rsv->space_info != target->space_info) @@ -304,7 +304,7 @@ int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes) if (block_rsv->reserved >= num_bytes) { block_rsv->reserved -= num_bytes; if (block_rsv->reserved < block_rsv->size) - block_rsv->full = 0; + block_rsv->full = false; ret = 0; } spin_unlock(&block_rsv->lock); @@ -319,7 +319,7 @@ void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, if (update_size) block_rsv->size += num_bytes; else if (block_rsv->reserved >= block_rsv->size) - block_rsv->full = 1; + block_rsv->full = true; spin_unlock(&block_rsv->lock); } @@ -341,7 +341,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, } global_rsv->reserved -= num_bytes; if (global_rsv->reserved < global_rsv->size) - global_rsv->full = 0; + global_rsv->full = false; spin_unlock(&global_rsv->lock); btrfs_block_rsv_add_bytes(dest, num_bytes, true); @@ -408,10 +408,7 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) btrfs_try_granting_tickets(fs_info, sinfo); } - if (block_rsv->reserved == block_rsv->size) - block_rsv->full = 1; - else - block_rsv->full = 0; + block_rsv->full = (block_rsv->reserved == block_rsv->size); if (block_rsv->size >= sinfo->total_bytes) sinfo->force_alloc = CHUNK_ALLOC_FORCE; @@ -427,6 +424,7 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root) case BTRFS_CSUM_TREE_OBJECTID: case BTRFS_EXTENT_TREE_OBJECTID: case BTRFS_FREE_SPACE_TREE_OBJECTID: + case BTRFS_BLOCK_GROUP_TREE_OBJECTID: root->block_rsv = &fs_info->delayed_refs_rsv; break; case BTRFS_ROOT_TREE_OBJECTID: diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 3b67ff08d434..578c3497a455 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -9,7 +9,7 @@ enum btrfs_reserve_flush_enum; /* * Types of block reserves */ -enum { +enum btrfs_rsv_type { BTRFS_BLOCK_RSV_GLOBAL, BTRFS_BLOCK_RSV_DELALLOC, BTRFS_BLOCK_RSV_TRANS, @@ -25,9 +25,10 @@ struct btrfs_block_rsv { u64 reserved; struct btrfs_space_info *space_info; spinlock_t lock; - unsigned short full; - unsigned short type; - unsigned short failfast; + bool full; + bool failfast; + /* Block reserve type, one of BTRFS_BLOCK_RSV_* */ + enum btrfs_rsv_type type:8; /* * Qgroup equivalent for @size @reserved @@ -49,13 +50,13 @@ struct btrfs_block_rsv { u64 qgroup_rsv_reserved; }; -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, enum btrfs_rsv_type type); void btrfs_init_root_block_rsv(struct btrfs_root *root); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, - unsigned short type); + enum btrfs_rsv_type type); void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv, - unsigned short type); + enum btrfs_rsv_type type); void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, @@ -91,4 +92,13 @@ static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info, btrfs_block_rsv_release(fs_info, block_rsv, 0, NULL); } +/* + * Fast path to check if the reserve is full, may be carefully used outside of + * locks. + */ +static inline bool btrfs_block_rsv_full(const struct btrfs_block_rsv *rsv) +{ + return data_race(rsv->full); +} + #endif /* BTRFS_BLOCK_RSV_H */ diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b3e46aabc3d8..54c2ccb36b61 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -14,6 +14,13 @@ #include "delayed-inode.h" /* + * Since we search a directory based on f_pos (struct dir_context::pos) we have + * to start at 2 since '.' and '..' have f_pos of 0 and 1 respectively, so + * everybody else has to start at 2 (see btrfs_real_readdir() and dir_emit_dots()). + */ +#define BTRFS_DIR_START_INDEX 2 + +/* * ordered_data_close is set by truncate when a file that used * to have good data has been truncated to zero. When it is set * the btrfs file release call will add this inode to the @@ -58,6 +65,8 @@ enum { * on the same file. */ BTRFS_INODE_VERITY_IN_PROGRESS, + /* Set when this inode is a free space inode. */ + BTRFS_INODE_FREE_SPACE_INODE, }; /* in memory btrfs inode */ @@ -87,7 +96,8 @@ struct btrfs_inode { /* special utility tree used to record which mirrors have already been * tried when checksums fail for a given block */ - struct extent_io_tree io_failure_tree; + struct rb_root io_failure_tree; + spinlock_t io_failure_lock; /* * Keep track of where the inode has extent items mapped in order to @@ -173,8 +183,9 @@ struct btrfs_inode { u64 disk_i_size; /* - * if this is a directory then index_cnt is the counter for the index - * number for new files that are created + * If this is a directory then index_cnt is the counter for the index + * number for new files that are created. For an empty directory, this + * must be initialized to BTRFS_DIR_START_INDEX. */ u64 index_cnt; @@ -242,11 +253,6 @@ struct btrfs_inode { struct inode vfs_inode; }; -static inline u32 btrfs_inode_sectorsize(const struct btrfs_inode *inode) -{ - return inode->root->fs_info->sectorsize; -} - static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) { return container_of(inode, struct btrfs_inode, vfs_inode); @@ -264,26 +270,31 @@ static inline unsigned long btrfs_inode_hash(u64 objectid, return (unsigned long)h; } -static inline void btrfs_insert_inode_hash(struct inode *inode) -{ - unsigned long h = btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root); - - __insert_inode_hash(inode, h); -} +#if BITS_PER_LONG == 32 +/* + * On 32 bit systems the i_ino of struct inode is 32 bits (unsigned long), so + * we use the inode's location objectid which is a u64 to avoid truncation. + */ static inline u64 btrfs_ino(const struct btrfs_inode *inode) { u64 ino = inode->location.objectid; - /* - * !ino: btree_inode - * type == BTRFS_ROOT_ITEM_KEY: subvol dir - */ - if (!ino || inode->location.type == BTRFS_ROOT_ITEM_KEY) + /* type == BTRFS_ROOT_ITEM_KEY: subvol dir */ + if (inode->location.type == BTRFS_ROOT_ITEM_KEY) ino = inode->vfs_inode.i_ino; return ino; } +#else + +static inline u64 btrfs_ino(const struct btrfs_inode *inode) +{ + return inode->vfs_inode.i_ino; +} + +#endif + static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size) { i_size_write(&inode->vfs_inode, size); @@ -292,14 +303,7 @@ static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size) static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode) { - struct btrfs_root *root = inode->root; - - if (root == root->fs_info->tree_root && - btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID) - return true; - if (inode->location.objectid == BTRFS_FREE_INO_OBJECTID) - return true; - return false; + return test_bit(BTRFS_INODE_FREE_SPACE_INODE, &inode->runtime_flags); } static inline bool is_data_inode(struct inode *inode) @@ -333,6 +337,36 @@ static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode) spin_unlock(&inode->lock); } +/* + * Should be called while holding the inode's VFS lock in exclusive mode or in a + * context where no one else can access the inode concurrently (during inode + * creation or when loading an inode from disk). + */ +static inline void btrfs_set_inode_full_sync(struct btrfs_inode *inode) +{ + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); + /* + * The inode may have been part of a reflink operation in the last + * transaction that modified it, and then a fsync has reset the + * last_reflink_trans to avoid subsequent fsyncs in the same + * transaction to do unnecessary work. So update last_reflink_trans + * to the last_trans value (we have to be pessimistic and assume a + * reflink happened). + * + * The ->last_trans is protected by the inode's spinlock and we can + * have a concurrent ordered extent completion update it. Also set + * last_reflink_trans to ->last_trans only if the former is less than + * the later, because we can be called in a context where + * last_reflink_trans was set to the current transaction generation + * while ->last_trans was not yet updated in the current transaction, + * and therefore has a lower value. + */ + spin_lock(&inode->lock); + if (inode->last_reflink_trans < inode->last_trans) + inode->last_reflink_trans = inode->last_trans; + spin_unlock(&inode->lock); +} + static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) { bool ret = false; @@ -346,30 +380,16 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) return ret; } -struct btrfs_dio_private { - struct inode *inode; - - /* - * Since DIO can use anonymous page, we cannot use page_offset() to - * grab the file offset, thus need a dedicated member for file offset. - */ - u64 file_offset; - u64 disk_bytenr; - /* Used for bio::bi_size */ - u32 bytes; - - /* - * References to this structure. There is one reference per in-flight - * bio plus one while we're still setting up. - */ - refcount_t refs; - - /* dio_bio came from fs/direct-io.c */ - struct bio *dio_bio; - - /* Array of checksums */ - u8 csums[]; -}; +/* + * Check if the inode has flags compatible with compression + */ +static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode) +{ + if (inode->flags & BTRFS_INODE_NODATACOW || + inode->flags & BTRFS_INODE_NODATASUM) + return false; + return true; +} /* * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 7e9f90fa0388..98c6e5feab19 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -78,7 +78,6 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/mutex.h> -#include <linux/genhd.h> #include <linux/blkdev.h> #include <linux/mm.h> #include <linux/string.h> @@ -153,7 +152,7 @@ struct btrfsic_block { struct btrfsic_block *next_in_same_bio; void *orig_bio_private; bio_end_io_t *orig_bio_end_io; - int submit_bio_bh_rw; + blk_opf_t submit_bio_bh_rw; u64 flush_gen; /* only valid if !never_written */ }; @@ -1553,21 +1552,18 @@ static int btrfsic_read_block(struct btrfsic_state *state, return -ENOMEM; block_ctx->datav = block_ctx->mem_to_free; block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages); - for (i = 0; i < num_pages; i++) { - block_ctx->pagev[i] = alloc_page(GFP_NOFS); - if (!block_ctx->pagev[i]) - return -1; - } + ret = btrfs_alloc_page_array(num_pages, block_ctx->pagev); + if (ret) + return ret; dev_bytenr = block_ctx->dev_bytenr; for (i = 0; i < num_pages;) { struct bio *bio; unsigned int j; - bio = btrfs_bio_alloc(num_pages - i); - bio_set_dev(bio, block_ctx->dev->bdev); + bio = bio_alloc(block_ctx->dev->bdev, num_pages - i, + REQ_OP_READ, GFP_NOFS); bio->bi_iter.bi_sector = dev_bytenr >> 9; - bio->bi_opf = REQ_OP_READ; for (j = i; j < num_pages; j++) { ret = bio_add_page(bio, block_ctx->pagev[j], @@ -1685,7 +1681,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, u64 dev_bytenr, char **mapped_datav, unsigned int num_pages, struct bio *bio, int *bio_is_patched, - int submit_bio_bh_rw) + blk_opf_t submit_bio_bh_rw) { int is_metadata; struct btrfsic_block *block; @@ -2034,7 +2030,7 @@ continue_loop: static void btrfsic_bio_end_io(struct bio *bp) { - struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private; + struct btrfsic_block *block = bp->bi_private; int iodone_w_error; /* mutex is not held! This is not save if IO is not yet completed @@ -2636,100 +2632,93 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev) &btrfsic_dev_state_hashtable); } -static void __btrfsic_submit_bio(struct bio *bio) +static void btrfsic_check_write_bio(struct bio *bio, struct btrfsic_dev_state *dev_state) { - struct btrfsic_dev_state *dev_state; + unsigned int segs = bio_segments(bio); + u64 dev_bytenr = 512 * bio->bi_iter.bi_sector; + u64 cur_bytenr = dev_bytenr; + struct bvec_iter iter; + struct bio_vec bvec; + char **mapped_datav; + int bio_is_patched = 0; + int i = 0; + + if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + pr_info( +"submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", + bio_op(bio), bio->bi_opf, segs, + bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); - if (!btrfsic_is_initialized) + mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS); + if (!mapped_datav) return; - mutex_lock(&btrfsic_mutex); - /* since btrfsic_submit_bio() is also called before - * btrfsic_mount(), this might return NULL */ - dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev); - if (NULL != dev_state && - (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { - int i = 0; - u64 dev_bytenr; - u64 cur_bytenr; - struct bio_vec bvec; - struct bvec_iter iter; - int bio_is_patched; - char **mapped_datav; - unsigned int segs = bio_segments(bio); - - dev_bytenr = 512 * bio->bi_iter.bi_sector; - bio_is_patched = 0; - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", - bio_op(bio), bio->bi_opf, segs, - bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); - - mapped_datav = kmalloc_array(segs, - sizeof(*mapped_datav), GFP_NOFS); - if (!mapped_datav) - goto leave; - cur_bytenr = dev_bytenr; - - bio_for_each_segment(bvec, bio, iter) { - BUG_ON(bvec.bv_len != PAGE_SIZE); - mapped_datav[i] = page_address(bvec.bv_page); - i++; - - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE) - pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n", - i, cur_bytenr, bvec.bv_len, bvec.bv_offset); - cur_bytenr += bvec.bv_len; - } - btrfsic_process_written_block(dev_state, dev_bytenr, - mapped_datav, segs, - bio, &bio_is_patched, - bio->bi_opf); - kfree(mapped_datav); - } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) { - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n", - bio_op(bio), bio->bi_opf, bio->bi_bdev); - if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { - if ((dev_state->state->print_mask & - (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE))) - pr_info( -"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n", - dev_state->bdev); - } else { - struct btrfsic_block *const block = - &dev_state->dummy_block_for_bio_bh_flush; + bio_for_each_segment(bvec, bio, iter) { + BUG_ON(bvec.bv_len != PAGE_SIZE); + mapped_datav[i] = page_address(bvec.bv_page); + i++; - block->is_iodone = 0; - block->never_written = 0; - block->iodone_w_error = 0; - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = bio->bi_opf; - block->orig_bio_private = bio->bi_private; - block->orig_bio_end_io = bio->bi_end_io; - block->next_in_same_bio = NULL; - bio->bi_private = block; - bio->bi_end_io = btrfsic_bio_end_io; - } + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE) + pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n", + i, cur_bytenr, bvec.bv_len, bvec.bv_offset); + cur_bytenr += bvec.bv_len; } -leave: - mutex_unlock(&btrfsic_mutex); + + btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, segs, + bio, &bio_is_patched, bio->bi_opf); + kfree(mapped_datav); } -void btrfsic_submit_bio(struct bio *bio) +static void btrfsic_check_flush_bio(struct bio *bio, struct btrfsic_dev_state *dev_state) { - __btrfsic_submit_bio(bio); - submit_bio(bio); + if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n", + bio_op(bio), bio->bi_opf, bio->bi_bdev); + + if (dev_state->dummy_block_for_bio_bh_flush.is_iodone) { + struct btrfsic_block *const block = + &dev_state->dummy_block_for_bio_bh_flush; + + block->is_iodone = 0; + block->never_written = 0; + block->iodone_w_error = 0; + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = bio->bi_opf; + block->orig_bio_private = bio->bi_private; + block->orig_bio_end_io = bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + } else if ((dev_state->state->print_mask & + (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE))) { + pr_info( +"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n", + dev_state->bdev); + } } -int btrfsic_submit_bio_wait(struct bio *bio) +void btrfsic_check_bio(struct bio *bio) { - __btrfsic_submit_bio(bio); - return submit_bio_wait(bio); + struct btrfsic_dev_state *dev_state; + + if (!btrfsic_is_initialized) + return; + + /* + * We can be called before btrfsic_mount, so there might not be a + * dev_state. + */ + dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev); + mutex_lock(&btrfsic_mutex); + if (dev_state) { + if (bio_op(bio) == REQ_OP_WRITE && bio_has_data(bio)) + btrfsic_check_write_bio(bio, dev_state); + else if (bio->bi_opf & REQ_PREFLUSH) + btrfsic_check_flush_bio(bio, dev_state); + } + mutex_unlock(&btrfsic_mutex); } int btrfsic_mount(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h index bcc730a06cb5..e4c8aed7996f 100644 --- a/fs/btrfs/check-integrity.h +++ b/fs/btrfs/check-integrity.h @@ -7,11 +7,9 @@ #define BTRFS_CHECK_INTEGRITY_H #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY -void btrfsic_submit_bio(struct bio *bio); -int btrfsic_submit_bio_wait(struct bio *bio); +void btrfsic_check_bio(struct bio *bio); #else -#define btrfsic_submit_bio submit_bio -#define btrfsic_submit_bio_wait submit_bio_wait +static inline void btrfsic_check_bio(struct bio *bio) { } #endif int btrfsic_mount(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 71e5b2e9a1ba..e6635fe70067 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -8,6 +8,7 @@ #include <linux/file.h> #include <linux/fs.h> #include <linux/pagemap.h> +#include <linux/pagevec.h> #include <linux/highmem.h> #include <linux/kthread.h> #include <linux/time.h> @@ -15,6 +16,7 @@ #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/writeback.h> +#include <linux/psi.h> #include <linux/slab.h> #include <linux/sched/mm.h> #include <linux/log2.h> @@ -136,109 +138,14 @@ static int compression_decompress(int type, struct list_head *ws, static int btrfs_decompress_bio(struct compressed_bio *cb); -static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, - unsigned long disk_size) -{ - return sizeof(struct compressed_bio) + - (DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * fs_info->csum_size; -} - -static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, - u64 disk_start) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - const u32 csum_size = fs_info->csum_size; - const u32 sectorsize = fs_info->sectorsize; - struct page *page; - unsigned int i; - char *kaddr; - u8 csum[BTRFS_CSUM_SIZE]; - struct compressed_bio *cb = bio->bi_private; - u8 *cb_sum = cb->sums; - - if ((inode->flags & BTRFS_INODE_NODATASUM) || - test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) - return 0; - - shash->tfm = fs_info->csum_shash; - - for (i = 0; i < cb->nr_pages; i++) { - u32 pg_offset; - u32 bytes_left = PAGE_SIZE; - page = cb->compressed_pages[i]; - - /* Determine the remaining bytes inside the page first */ - if (i == cb->nr_pages - 1) - bytes_left = cb->compressed_len - i * PAGE_SIZE; - - /* Hash through the page sector by sector */ - for (pg_offset = 0; pg_offset < bytes_left; - pg_offset += sectorsize) { - kaddr = kmap_atomic(page); - crypto_shash_digest(shash, kaddr + pg_offset, - sectorsize, csum); - kunmap_atomic(kaddr); - - if (memcmp(&csum, cb_sum, csum_size) != 0) { - btrfs_print_data_csum_error(inode, disk_start, - csum, cb_sum, cb->mirror_num); - if (btrfs_bio(bio)->device) - btrfs_dev_stat_inc_and_print( - btrfs_bio(bio)->device, - BTRFS_DEV_STAT_CORRUPTION_ERRS); - return -EIO; - } - cb_sum += csum_size; - disk_start += sectorsize; - } - } - return 0; -} - -/* - * Reduce bio and io accounting for a compressed_bio with its corresponding bio. - * - * Return true if there is no pending bio nor io. - * Return false otherwise. - */ -static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *bio) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); - unsigned int bi_size = 0; - bool last_io = false; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - /* - * At endio time, bi_iter.bi_size doesn't represent the real bio size. - * Thus here we have to iterate through all segments to grab correct - * bio size. - */ - bio_for_each_segment_all(bvec, bio, iter_all) - bi_size += bvec->bv_len; - - if (bio->bi_status) - cb->errors = 1; - - ASSERT(bi_size && bi_size <= cb->compressed_len); - last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits, - &cb->pending_sectors); - /* - * Here we must wake up the possible error handler after all other - * operations on @cb finished, or we can race with - * finish_compressed_bio_*() which may free @cb. - */ - wake_up_var(cb); - - return last_io; -} - -static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bio) +static void finish_compressed_bio_read(struct compressed_bio *cb) { unsigned int index; struct page *page; + if (cb->status == BLK_STS_OK) + cb->status = errno_to_blk_status(btrfs_decompress_bio(cb)); + /* Release the compressed pages */ for (index = 0; index < cb->nr_pages; index++) { page = cb->compressed_pages[index]; @@ -247,87 +154,60 @@ static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bi } /* Do io completion on the original bio */ - if (cb->errors) { - bio_io_error(cb->orig_bio); - } else { - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - ASSERT(bio); - ASSERT(!bio->bi_status); - /* - * We have verified the checksum already, set page checked so - * the end_io handlers know about it - */ - ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) { - u64 bvec_start = page_offset(bvec->bv_page) + - bvec->bv_offset; - - btrfs_page_set_checked(btrfs_sb(cb->inode->i_sb), - bvec->bv_page, bvec_start, - bvec->bv_len); - } - - bio_endio(cb->orig_bio); - } + btrfs_bio_end_io(btrfs_bio(cb->orig_bio), cb->status); /* Finally free the cb struct */ kfree(cb->compressed_pages); kfree(cb); } -/* when we finish reading compressed pages from the disk, we - * decompress them and then run the bio end_io routines on the - * decompressed pages (in the inode address space). - * - * This allows the checksumming and other IO error handling routines - * to work normally - * - * The compressed pages are freed here, and it must be run - * in process context +/* + * Verify the checksums and kick off repair if needed on the uncompressed data + * before decompressing it into the original bio and freeing the uncompressed + * pages. */ -static void end_compressed_bio_read(struct bio *bio) +static void end_compressed_bio_read(struct btrfs_bio *bbio) { - struct compressed_bio *cb = bio->bi_private; - struct inode *inode; - unsigned int mirror = btrfs_bio(bio)->mirror_num; - int ret = 0; - - if (!dec_and_test_compressed_bio(cb, bio)) - goto out; - - /* - * Record the correct mirror_num in cb->orig_bio so that - * read-repair can work properly. - */ - btrfs_bio(cb->orig_bio)->mirror_num = mirror; - cb->mirror_num = mirror; - - /* - * Some IO in this cb have failed, just skip checksum as there - * is no way it could be correct. - */ - if (cb->errors == 1) - goto csum_failed; - - inode = cb->inode; - ret = check_compressed_csum(BTRFS_I(inode), bio, - bio->bi_iter.bi_sector << 9); - if (ret) - goto csum_failed; + struct compressed_bio *cb = bbio->private; + struct inode *inode = cb->inode; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_inode *bi = BTRFS_I(inode); + bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) && + !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); + blk_status_t status = bbio->bio.bi_status; + struct bvec_iter iter; + struct bio_vec bv; + u32 offset; + + btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { + u64 start = bbio->file_offset + offset; + + if (!status && + (!csum || !btrfs_check_data_csum(inode, bbio, offset, + bv.bv_page, bv.bv_offset))) { + btrfs_clean_io_failure(bi, start, bv.bv_page, + bv.bv_offset); + } else { + int ret; + + refcount_inc(&cb->pending_ios); + ret = btrfs_repair_one_sector(inode, bbio, offset, + bv.bv_page, bv.bv_offset, + btrfs_submit_data_read_bio); + if (ret) { + refcount_dec(&cb->pending_ios); + status = errno_to_blk_status(ret); + } + } + } - /* ok, we're the last bio for this extent, lets start - * the decompression. - */ - ret = btrfs_decompress_bio(cb); + if (status) + cb->status = status; -csum_failed: - if (ret) - cb->errors = 1; - finish_compressed_bio_read(cb, bio); -out: - bio_put(bio); + if (refcount_dec_and_test(&cb->pending_ios)) + finish_compressed_bio_read(cb); + btrfs_bio_free_csum(bbio); + bio_put(&bbio->bio); } /* @@ -340,32 +220,31 @@ static noinline void end_compressed_writeback(struct inode *inode, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); unsigned long index = cb->start >> PAGE_SHIFT; unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; - struct page *pages[16]; - unsigned long nr_pages = end_index - index + 1; + struct folio_batch fbatch; + const int errno = blk_status_to_errno(cb->status); int i; int ret; - if (cb->errors) - mapping_set_error(inode->i_mapping, -EIO); + if (errno) + mapping_set_error(inode->i_mapping, errno); + + folio_batch_init(&fbatch); + while (index <= end_index) { + ret = filemap_get_folios(inode->i_mapping, &index, end_index, + &fbatch); + + if (ret == 0) + return; - while (nr_pages > 0) { - ret = find_get_pages_contig(inode->i_mapping, index, - min_t(unsigned long, - nr_pages, ARRAY_SIZE(pages)), pages); - if (ret == 0) { - nr_pages -= 1; - index += 1; - continue; - } for (i = 0; i < ret; i++) { - if (cb->errors) - SetPageError(pages[i]); - btrfs_page_clamp_clear_writeback(fs_info, pages[i], + struct folio *folio = fbatch.folios[i]; + + if (errno) + folio_set_error(folio); + btrfs_page_clamp_clear_writeback(fs_info, &folio->page, cb->start, cb->len); - put_page(pages[i]); } - nr_pages -= ret; - index += ret; + folio_batch_release(&fbatch); } /* the inode may be gone now */ } @@ -381,9 +260,10 @@ static void finish_compressed_bio_write(struct compressed_bio *cb) */ btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL, cb->start, cb->start + cb->len - 1, - !cb->errors); + cb->status == BLK_STS_OK); - end_compressed_writeback(inode, cb); + if (cb->writeback) + end_compressed_writeback(inode, cb); /* Note, our inode could be gone now */ /* @@ -402,6 +282,14 @@ static void finish_compressed_bio_write(struct compressed_bio *cb) kfree(cb); } +static void btrfs_finish_compressed_write_work(struct work_struct *work) +{ + struct compressed_bio *cb = + container_of(work, struct compressed_bio, write_end_work); + + finish_compressed_bio_write(cb); +} + /* * Do the cleanup once all the compressed pages hit the disk. This will clear * writeback on the file pages and free the compressed pages. @@ -409,32 +297,20 @@ static void finish_compressed_bio_write(struct compressed_bio *cb) * This also calls the writeback end hooks for the file pages so that metadata * and checksums can be updated in the file. */ -static void end_compressed_bio_write(struct bio *bio) +static void end_compressed_bio_write(struct btrfs_bio *bbio) { - struct compressed_bio *cb = bio->bi_private; - - if (!dec_and_test_compressed_bio(cb, bio)) - goto out; - - btrfs_record_physical_zoned(cb->inode, cb->start, bio); + struct compressed_bio *cb = bbio->private; - finish_compressed_bio_write(cb); -out: - bio_put(bio); -} + if (bbio->bio.bi_status) + cb->status = bbio->bio.bi_status; -static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info, - struct compressed_bio *cb, - struct bio *bio, int mirror_num) -{ - blk_status_t ret; + if (refcount_dec_and_test(&cb->pending_ios)) { + struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); - ASSERT(bio->bi_iter.bi_size); - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); - if (ret) - return ret; - ret = btrfs_map_bio(fs_info, bio, mirror_num); - return ret; + btrfs_record_physical_zoned(cb->inode, cb->start, &bbio->bio); + queue_work(fs_info->compressed_write_workers, &cb->write_end_work); + } + bio_put(&bbio->bio); } /* @@ -455,7 +331,8 @@ static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info, static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr, - unsigned int opf, bio_end_io_t endio_func, + blk_opf_t opf, + btrfs_bio_end_io_t endio_func, u64 *next_stripe_start) { struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); @@ -464,12 +341,8 @@ static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_byte struct bio *bio; int ret; - bio = btrfs_bio_alloc(BIO_MAX_VECS); - + bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, endio_func, cb); bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; - bio->bi_opf = opf; - bio->bi_private = cb; - bio->bi_end_io = endio_func; em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize); if (IS_ERR(em)) { @@ -487,7 +360,7 @@ static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_byte return ERR_PTR(ret); } *next_stripe_start = disk_bytenr + geom.len; - + refcount_inc(&cb->pending_ios); return bio; } @@ -505,35 +378,39 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, unsigned int compressed_len, struct page **compressed_pages, unsigned int nr_pages, - unsigned int write_flags, - struct cgroup_subsys_state *blkcg_css) + blk_opf_t write_flags, + struct cgroup_subsys_state *blkcg_css, + bool writeback) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio = NULL; struct compressed_bio *cb; u64 cur_disk_bytenr = disk_start; u64 next_stripe_start; - blk_status_t ret; + blk_status_t ret = BLK_STS_OK; int skip_sum = inode->flags & BTRFS_INODE_NODATASUM; const bool use_append = btrfs_use_zone_append(inode, disk_start); - const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE; + const enum req_op bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE; ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(len, fs_info->sectorsize)); - cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); + cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS); if (!cb) return BLK_STS_RESOURCE; - refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits); - cb->errors = 0; + refcount_set(&cb->pending_ios, 1); + cb->status = BLK_STS_OK; cb->inode = &inode->vfs_inode; cb->start = start; cb->len = len; - cb->mirror_num = 0; cb->compressed_pages = compressed_pages; cb->compressed_len = compressed_len; - cb->orig_bio = NULL; + cb->writeback = writeback; + INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); cb->nr_pages = nr_pages; + if (blkcg_css) + kthread_associate_blkcg(blkcg_css); + while (cur_disk_bytenr < disk_start + compressed_len) { u64 offset = cur_disk_bytenr - disk_start; unsigned int index = offset >> PAGE_SHIFT; @@ -549,9 +426,10 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, &next_stripe_start); if (IS_ERR(bio)) { ret = errno_to_blk_status(PTR_ERR(bio)); - bio = NULL; - goto finish_cb; + break; } + if (blkcg_css) + bio->bi_opf |= REQ_CGROUP_PUNT; } /* * We should never reach next_stripe_start start as we will @@ -591,42 +469,25 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, if (submit) { if (!skip_sum) { - ret = btrfs_csum_one_bio(inode, bio, start, 1); - if (ret) - goto finish_cb; + ret = btrfs_csum_one_bio(inode, bio, start, true); + if (ret) { + btrfs_bio_end_io(btrfs_bio(bio), ret); + break; + } } - ret = submit_compressed_bio(fs_info, cb, bio, 0); - if (ret) - goto finish_cb; + ASSERT(bio->bi_iter.bi_size); + btrfs_submit_bio(fs_info, bio, 0); bio = NULL; } cond_resched(); } + if (blkcg_css) kthread_associate_blkcg(NULL); - return 0; - -finish_cb: - if (bio) { - bio->bi_status = ret; - bio_endio(bio); - } - /* Last byte of @cb is submitted, endio will free @cb */ - if (cur_disk_bytenr == disk_start + compressed_len) - return ret; - - wait_var_event(cb, refcount_read(&cb->pending_sectors) == - (disk_start + compressed_len - cur_disk_bytenr) >> - fs_info->sectorsize_bits); - /* - * Even with previous bio ended, we should still have io not yet - * submitted, thus need to finish manually. - */ - ASSERT(refcount_read(&cb->pending_sectors)); - /* Now we are the only one referring @cb, can finish it safely. */ - finish_compressed_bio_write(cb); + if (refcount_dec_and_test(&cb->pending_ios)) + finish_compressed_bio_write(cb); return ret; } @@ -650,7 +511,8 @@ static u64 bio_end_offset(struct bio *bio) */ static noinline int add_ra_bio_pages(struct inode *inode, u64 compressed_end, - struct compressed_bio *cb) + struct compressed_bio *cb, + int *memstall, unsigned long *pflags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); unsigned long end_index; @@ -719,6 +581,11 @@ static noinline int add_ra_bio_pages(struct inode *inode, continue; } + if (!*memstall && PageWorkingset(page)) { + psi_memstall_enter(pflags); + *memstall = 1; + } + ret = set_page_extent_mapped(page); if (ret < 0) { unlock_page(page); @@ -727,7 +594,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, } page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1; - lock_extent(tree, cur, page_end); + lock_extent(tree, cur, page_end, NULL); read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur); read_unlock(&em_tree->lock); @@ -741,7 +608,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, (cur + fs_info->sectorsize > extent_map_end(em)) || (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) { free_extent_map(em); - unlock_extent(tree, cur, page_end); + unlock_extent(tree, cur, page_end, NULL); unlock_page(page); put_page(page); break; @@ -755,14 +622,13 @@ static noinline int add_ra_bio_pages(struct inode *inode, int zeros; zeros = PAGE_SIZE - zero_offset; memzero_page(page, zero_offset, zeros); - flush_dcache_page(page); } } add_size = min(em->start + em->len, page_end + 1) - cur; ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur)); if (ret != add_size) { - unlock_extent(tree, cur, page_end); + unlock_extent(tree, cur, page_end, NULL); unlock_page(page); put_page(page); break; @@ -791,15 +657,13 @@ static noinline int add_ra_bio_pages(struct inode *inode, * After the compressed pages are read, we copy the bytes into the * bio we were passed and then call the bio end_io calls */ -blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags) +void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map_tree *em_tree; struct compressed_bio *cb; unsigned int compressed_len; - unsigned int nr_pages; - unsigned int pg_index; struct bio *comp_bio = NULL; const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 cur_disk_byte = disk_bytenr; @@ -808,9 +672,11 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 em_len; u64 em_start; struct extent_map *em; - blk_status_t ret = BLK_STS_RESOURCE; - int faili = 0; - u8 *sums; + unsigned long pflags; + int memstall = 0; + blk_status_t ret; + int ret2; + int i; em_tree = &BTRFS_I(inode)->extent_tree; @@ -821,51 +687,49 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize); read_unlock(&em_tree->lock); - if (!em) - return BLK_STS_IOERR; + if (!em) { + ret = BLK_STS_IOERR; + goto out; + } ASSERT(em->compress_type != BTRFS_COMPRESS_NONE); compressed_len = em->block_len; - cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); - if (!cb) + cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS); + if (!cb) { + ret = BLK_STS_RESOURCE; goto out; + } - refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits); - cb->errors = 0; + refcount_set(&cb->pending_ios, 1); + cb->status = BLK_STS_OK; cb->inode = inode; - cb->mirror_num = mirror_num; - sums = cb->sums; cb->start = em->orig_start; em_len = em->len; em_start = em->start; - free_extent_map(em); - em = NULL; - cb->len = bio->bi_iter.bi_size; cb->compressed_len = compressed_len; - cb->compress_type = extent_compress_type(bio_flags); + cb->compress_type = em->compress_type; cb->orig_bio = bio; - nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); - cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *), - GFP_NOFS); - if (!cb->compressed_pages) - goto fail1; - - for (pg_index = 0; pg_index < nr_pages; pg_index++) { - cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS); - if (!cb->compressed_pages[pg_index]) { - faili = pg_index - 1; - ret = BLK_STS_RESOURCE; - goto fail2; - } + free_extent_map(em); + em = NULL; + + cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); + cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS); + if (!cb->compressed_pages) { + ret = BLK_STS_RESOURCE; + goto fail; } - faili = nr_pages - 1; - cb->nr_pages = nr_pages; - add_ra_bio_pages(inode, em_start + em_len, cb); + ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages); + if (ret2) { + ret = BLK_STS_RESOURCE; + goto fail; + } + + add_ra_bio_pages(inode, em_start + em_len, cb, &memstall, &pflags); /* include any pages we added in add_ra-bio_pages */ cb->len = bio->bi_iter.bi_size; @@ -884,9 +748,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, REQ_OP_READ, end_compressed_bio_read, &next_stripe_start); if (IS_ERR(comp_bio)) { - ret = errno_to_blk_status(PTR_ERR(comp_bio)); - comp_bio = NULL; - goto finish_cb; + cb->status = errno_to_blk_status(PTR_ERR(comp_bio)); + break; } } /* @@ -922,56 +785,51 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, submit = true; if (submit) { - unsigned int nr_sectors; + /* Save the original iter for read repair */ + if (bio_op(comp_bio) == REQ_OP_READ) + btrfs_bio(comp_bio)->iter = comp_bio->bi_iter; - ret = btrfs_lookup_bio_sums(inode, comp_bio, sums); - if (ret) - goto finish_cb; + /* + * Save the initial offset of this chunk, as there + * is no direct correlation between compressed pages and + * the original file offset. The field is only used for + * priting error messages. + */ + btrfs_bio(comp_bio)->file_offset = file_offset; - nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size, - fs_info->sectorsize); - sums += fs_info->csum_size * nr_sectors; + ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL); + if (ret) { + btrfs_bio_end_io(btrfs_bio(comp_bio), ret); + break; + } - ret = submit_compressed_bio(fs_info, cb, comp_bio, mirror_num); - if (ret) - goto finish_cb; + ASSERT(comp_bio->bi_iter.bi_size); + btrfs_submit_bio(fs_info, comp_bio, mirror_num); comp_bio = NULL; } } - return 0; -fail2: - while (faili >= 0) { - __free_page(cb->compressed_pages[faili]); - faili--; + if (memstall) + psi_memstall_leave(&pflags); + + if (refcount_dec_and_test(&cb->pending_ios)) + finish_compressed_bio_read(cb); + return; + +fail: + if (cb->compressed_pages) { + for (i = 0; i < cb->nr_pages; i++) { + if (cb->compressed_pages[i]) + __free_page(cb->compressed_pages[i]); + } } kfree(cb->compressed_pages); -fail1: kfree(cb); out: free_extent_map(em); - return ret; -finish_cb: - if (comp_bio) { - comp_bio->bi_status = ret; - bio_endio(comp_bio); - } - /* All bytes of @cb is submitted, endio will free @cb */ - if (cur_disk_byte == disk_bytenr + compressed_len) - return ret; - - wait_var_event(cb, refcount_read(&cb->pending_sectors) == - (disk_bytenr + compressed_len - cur_disk_byte) >> - fs_info->sectorsize_bits); - /* - * Even with previous bio ended, we should still have io not yet - * submitted, thus need to finish @cb manually. - */ - ASSERT(refcount_read(&cb->pending_sectors)); - /* Now we are the only one referring @cb, can finish it safely. */ - finish_compressed_bio_read(cb, NULL); - return ret; + btrfs_bio_end_io(btrfs_bio(bio), ret); + return; } /* @@ -1470,7 +1328,6 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len, ASSERT(copy_start - decompressed < buf_len); memcpy_to_page(bvec.bv_page, bvec.bv_offset, buf + copy_start - decompressed, copy_len); - flush_dcache_page(bvec.bv_page); cur_offset += copy_len; bio_advance(orig_bio, copy_len); diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 56eef0821e3e..1aa02903de69 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -22,14 +22,16 @@ struct btrfs_inode; /* Maximum length of compressed data stored on disk */ #define BTRFS_MAX_COMPRESSED (SZ_128K) +static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); + /* Maximum size of data before compression */ #define BTRFS_MAX_UNCOMPRESSED (SZ_128K) #define BTRFS_ZLIB_DEFAULT_LEVEL 3 struct compressed_bio { - /* Number of sectors with unfinished IO (unsubmitted or unfinished) */ - refcount_t pending_sectors; + /* Number of outstanding bios */ + refcount_t pending_ios; /* Number of compressed pages in the array */ unsigned int nr_pages; @@ -52,18 +54,17 @@ struct compressed_bio { /* The compression algorithm for this bio */ u8 compress_type; - /* IO errors */ - u8 errors; - int mirror_num; + /* Whether this is a write for writeback. */ + bool writeback; - /* for reads, this is the bio we are copying the data into */ - struct bio *orig_bio; + /* IO errors */ + blk_status_t status; - /* - * the start of a variable length array of checksums only - * used by reads - */ - u8 sums[]; + union { + /* For reads, this is the bio we are copying the data into */ + struct bio *orig_bio; + struct work_struct write_end_work; + }; }; static inline unsigned int btrfs_compress_type(unsigned int type_level) @@ -94,10 +95,11 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, unsigned int compressed_len, struct page **compressed_pages, unsigned int nr_pages, - unsigned int write_flags, - struct cgroup_subsys_state *blkcg_css); -blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags); + blk_opf_t write_flags, + struct cgroup_subsys_state *blkcg_css, + bool writeback); +void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num); unsigned int btrfs_compress_str2level(unsigned int type, const char *str); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a7db3f6f1b7b..dcb510f38dda 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -16,6 +16,7 @@ #include "volumes.h" #include "qgroup.h" #include "tree-mod-log.h" +#include "tree-checker.h" static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); @@ -113,6 +114,22 @@ noinline void btrfs_release_path(struct btrfs_path *p) } /* + * We want the transaction abort to print stack trace only for errors where the + * cause could be a bug, eg. due to ENOSPC, and not for common errors that are + * caused by external factors. + */ +bool __cold abort_should_print_stack(int errno) +{ + switch (errno) { + case -EIO: + case -EROFS: + case -ENOMEM: + return false; + } + return true; +} + +/* * safely gets a reference on the root node of a tree. A lock * is not taken, so a concurrent writer may put a different node * at the root of the tree. See btrfs_lock_root_node for the @@ -342,7 +359,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, int level = btrfs_header_level(buf); ret = btrfs_set_disk_extent_flags(trans, buf, - new_flags, level, 0); + new_flags, level); if (ret) return ret; } @@ -846,9 +863,11 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, btrfs_header_owner(parent), btrfs_node_ptr_generation(parent, slot), level - 1, &first_key); - if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) + return eb; + if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); - eb = ERR_PTR(-EIO); + return ERR_PTR(-EIO); } return eb; @@ -1388,12 +1407,13 @@ static noinline void unlock_up(struct btrfs_path *path, int level, } /* - * helper function for btrfs_search_slot. The goal is to find a block - * in cache without setting the path to blocking. If we find the block - * we return zero and the path is unchanged. + * Helper function for btrfs_search_slot() and other functions that do a search + * on a btree. The goal is to find a tree block in the cache (the radix tree at + * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read + * its pages from disk. * - * If we can't find the block, we set the path blocking and do some - * reada. -EAGAIN is returned and the search must be repeated. + * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the + * whole btree search, starting again from the current root node. */ static int read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, @@ -1407,12 +1427,21 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, struct btrfs_key first_key; int ret; int parent_level; + bool unlock_up; + unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]); blocknr = btrfs_node_blockptr(*eb_ret, slot); gen = btrfs_node_ptr_generation(*eb_ret, slot); parent_level = btrfs_header_level(*eb_ret); btrfs_node_key_to_cpu(*eb_ret, &first_key, slot); + /* + * If we need to read an extent buffer from disk and we are holding locks + * on upper level nodes, we unlock all the upper nodes before reading the + * extent buffer, and then return -EAGAIN to the caller as it needs to + * restart the search. We don't release the lock on the current level + * because we need to walk this node to figure out which blocks to read. + */ tmp = find_extent_buffer(fs_info, blocknr); if (tmp) { if (p->reada == READA_FORWARD_ALWAYS) @@ -1434,47 +1463,68 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, return 0; } + if (p->nowait) { + free_extent_buffer(tmp); + return -EAGAIN; + } + + if (unlock_up) + btrfs_unlock_up_safe(p, level + 1); + /* now we're allowed to do a blocking uptodate check */ - ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key); - if (!ret) { - *eb_ret = tmp; - return 0; + ret = btrfs_read_extent_buffer(tmp, gen, parent_level - 1, &first_key); + if (ret) { + free_extent_buffer(tmp); + btrfs_release_path(p); + return -EIO; } - free_extent_buffer(tmp); - btrfs_release_path(p); - return -EIO; + if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) { + free_extent_buffer(tmp); + btrfs_release_path(p); + return -EUCLEAN; + } + + if (unlock_up) + ret = -EAGAIN; + + goto out; + } else if (p->nowait) { + return -EAGAIN; } - /* - * reduce lock contention at high levels - * of the btree by dropping locks before - * we read. Don't release the lock on the current - * level because we need to walk this node to figure - * out which blocks to read. - */ - btrfs_unlock_up_safe(p, level + 1); + if (unlock_up) { + btrfs_unlock_up_safe(p, level + 1); + ret = -EAGAIN; + } else { + ret = 0; + } if (p->reada != READA_NONE) reada_for_search(fs_info, p, level, slot, key->objectid); - ret = -EAGAIN; tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid, gen, parent_level - 1, &first_key); - if (!IS_ERR(tmp)) { - /* - * If the read above didn't mark this buffer up to date, - * it will never end up being up to date. Set ret to EIO now - * and give up so that our caller doesn't loop forever - * on our EAGAINs. - */ - if (!extent_buffer_uptodate(tmp)) - ret = -EIO; - free_extent_buffer(tmp); + if (IS_ERR(tmp)) { + btrfs_release_path(p); + return PTR_ERR(tmp); + } + /* + * If the read above didn't mark this buffer up to date, + * it will never end up being up to date. Set ret to EIO now + * and give up so that our caller doesn't loop forever + * on our EAGAINs. + */ + if (!extent_buffer_uptodate(tmp)) + ret = -EIO; + +out: + if (ret == 0) { + *eb_ret = tmp; } else { - ret = PTR_ERR(tmp); + free_extent_buffer(tmp); + btrfs_release_path(p); } - btrfs_release_path(p); return ret; } @@ -1607,7 +1657,13 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, * We don't know the level of the root node until we actually * have it read locked */ - b = btrfs_read_lock_root_node(root); + if (p->nowait) { + b = btrfs_try_read_lock_root_node(root); + if (IS_ERR(b)) + return b; + } else { + b = btrfs_read_lock_root_node(root); + } level = btrfs_header_level(b); if (level > write_lock_level) goto out; @@ -1883,6 +1939,13 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, WARN_ON(p->nodes[0] != NULL); BUG_ON(!cow && ins_len); + /* + * For now only allow nowait for read only operations. There's no + * strict reason why we can't, we just only need it for reads so it's + * only implemented for reads. + */ + ASSERT(!p->nowait || !cow); + if (ins_len < 0) { lowest_unlock = 2; @@ -1909,7 +1972,12 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (p->need_commit_sem) { ASSERT(p->search_commit_root); - down_read(&fs_info->commit_root_sem); + if (p->nowait) { + if (!down_read_trylock(&fs_info->commit_root_sem)) + return -EAGAIN; + } else { + down_read(&fs_info->commit_root_sem); + } } again: @@ -2048,11 +2116,22 @@ cow_done: if (!p->skip_locking) { level = btrfs_header_level(b); + + btrfs_maybe_reset_lockdep_class(root, b); + if (level <= write_lock_level) { btrfs_tree_lock(b); p->locks[level] = BTRFS_WRITE_LOCK; } else { - btrfs_tree_read_lock(b); + if (p->nowait) { + if (!btrfs_try_tree_read_lock(b)) { + free_extent_buffer(b); + ret = -EAGAIN; + goto done; + } + } else { + btrfs_tree_read_lock(b); + } p->locks[level] = BTRFS_READ_LOCK; } p->nodes[level] = b; @@ -2101,6 +2180,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, lowest_level = p->lowest_level; WARN_ON(p->nodes[0] != NULL); + ASSERT(!p->nowait); if (p->search_commit_root) { BUG_ON(time_seq); @@ -2277,6 +2357,43 @@ int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, return ret; } +/** + * Search for a valid slot for the given path. + * + * @root: The root node of the tree. + * @key: Will contain a valid item if found. + * @path: The starting point to validate the slot. + * + * Return: 0 if the item is valid + * 1 if not found + * <0 if error. + */ +int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *path) +{ + while (1) { + int ret; + const int slot = path->slots[0]; + const struct extent_buffer *leaf = path->nodes[0]; + + /* This is where we start walking the path. */ + if (slot >= btrfs_header_nritems(leaf)) { + /* + * If we've reached the last slot in this leaf we need + * to go to the next leaf and reset the path. + */ + ret = btrfs_next_leaf(root, path); + if (ret) + return ret; + continue; + } + /* Store the found, valid item in @key. */ + btrfs_item_key_to_cpu(leaf, key, slot); + break; + } + return 0; +} + /* * adjust the pointers going up the tree, starting at level * making sure the right key of each node is points to 'key'. @@ -2990,16 +3107,11 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (free_space < data_size) goto out_unlock; - /* cow and double check */ ret = btrfs_cow_block(trans, root, right, upper, slot + 1, &right, BTRFS_NESTING_RIGHT_COW); if (ret) goto out_unlock; - free_space = btrfs_leaf_free_space(right); - if (free_space < data_size) - goto out_unlock; - left_nritems = btrfs_header_nritems(left); if (left_nritems == 0) goto out_unlock; @@ -3224,7 +3336,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } - /* cow and double check */ ret = btrfs_cow_block(trans, root, left, path->nodes[1], slot - 1, &left, BTRFS_NESTING_LEFT_COW); @@ -3235,12 +3346,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } - free_space = btrfs_leaf_free_space(left); - if (free_space < data_size) { - ret = 1; - goto out; - } - if (check_sibling_keys(left, right)) { ret = -EUCLEAN; goto out; @@ -4170,24 +4275,22 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; - u32 last_off; - u32 dsize = 0; int ret = 0; int wret; - int i; u32 nritems; leaf = path->nodes[0]; - last_off = btrfs_item_offset(leaf, slot + nr - 1); - - for (i = 0; i < nr; i++) - dsize += btrfs_item_size(leaf, slot + i); - nritems = btrfs_header_nritems(leaf); if (slot + nr != nritems) { - int data_end = leaf_data_end(leaf); + const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1); + const int data_end = leaf_data_end(leaf); struct btrfs_map_token token; + u32 dsize = 0; + int i; + + for (i = 0; i < nr; i++) + dsize += btrfs_item_size(leaf, slot + i); memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + data_end + dsize, @@ -4227,24 +4330,50 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, fixup_low_keys(path, &disk_key, 1); } - /* delete the leaf if it is mostly empty */ + /* + * Try to delete the leaf if it is mostly empty. We do this by + * trying to move all its items into its left and right neighbours. + * If we can't move all the items, then we don't delete it - it's + * not ideal, but future insertions might fill the leaf with more + * items, or items from other leaves might be moved later into our + * leaf due to deletions on those leaves. + */ if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) { + u32 min_push_space; + /* push_leaf_left fixes the path. * make sure the path still points to our leaf * for possible call to del_ptr below */ slot = path->slots[1]; atomic_inc(&leaf->refs); - - wret = push_leaf_left(trans, root, path, 1, 1, - 1, (u32)-1); + /* + * We want to be able to at least push one item to the + * left neighbour leaf, and that's the first item. + */ + min_push_space = sizeof(struct btrfs_item) + + btrfs_item_size(leaf, 0); + wret = push_leaf_left(trans, root, path, 0, + min_push_space, 1, (u32)-1); if (wret < 0 && wret != -ENOSPC) ret = wret; if (path->nodes[0] == leaf && btrfs_header_nritems(leaf)) { - wret = push_leaf_right(trans, root, path, 1, - 1, 1, 0); + /* + * If we were not able to push all items from our + * leaf to its left neighbour, then attempt to + * either push all the remaining items to the + * right neighbour or none. There's no advantage + * in pushing only some items, instead of all, as + * it's pointless to end up with a leaf having + * too few items while the neighbours can be full + * or nearly full. + */ + nritems = btrfs_header_nritems(leaf); + min_push_space = leaf_space_used(leaf, 0, nritems); + wret = push_leaf_right(trans, root, path, 0, + min_push_space, 1, 0); if (wret < 0 && wret != -ENOSPC) ret = wret; } @@ -4353,6 +4482,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, int ret = 1; int keep_locks = path->keep_locks; + ASSERT(!path->nowait); path->keep_locks = 1; again: cur = btrfs_read_lock_root_node(root); @@ -4533,6 +4663,13 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, int ret; int i; + /* + * The nowait semantics are used only for write paths, where we don't + * use the tree mod log and sequence numbers. + */ + if (time_seq) + ASSERT(!path->nowait); + nritems = btrfs_header_nritems(path->nodes[0]); if (nritems == 0) return 1; @@ -4551,7 +4688,14 @@ again: if (path->need_commit_sem) { path->need_commit_sem = 0; need_commit_sem = true; - down_read(&fs_info->commit_root_sem); + if (path->nowait) { + if (!down_read_trylock(&fs_info->commit_root_sem)) { + ret = -EAGAIN; + goto done; + } + } else { + down_read(&fs_info->commit_root_sem); + } } ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); } @@ -4627,7 +4771,7 @@ again: next = c; ret = read_block_for_search(root, path, &next, level, slot, &key); - if (ret == -EAGAIN) + if (ret == -EAGAIN && !path->nowait) goto again; if (ret < 0) { @@ -4637,6 +4781,10 @@ again: if (!path->skip_locking) { ret = btrfs_try_tree_read_lock(next); + if (!ret && path->nowait) { + ret = -EAGAIN; + goto done; + } if (!ret && time_seq) { /* * If we don't get the lock, we may be racing @@ -4667,7 +4815,7 @@ again: ret = read_block_for_search(root, path, &next, level, 0, &key); - if (ret == -EAGAIN) + if (ret == -EAGAIN && !path->nowait) goto again; if (ret < 0) { @@ -4675,8 +4823,16 @@ again: goto done; } - if (!path->skip_locking) - btrfs_tree_read_lock(next); + if (!path->skip_locking) { + if (path->nowait) { + if (!btrfs_try_tree_read_lock(next)) { + ret = -EAGAIN; + goto done; + } + } else { + btrfs_tree_read_lock(next); + } + } } ret = 0; done: diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b4a9b1c58d22..9e6d48ff4597 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -42,13 +42,18 @@ struct btrfs_delayed_ref_root; struct btrfs_space_info; struct btrfs_block_group; extern struct kmem_cache *btrfs_trans_handle_cachep; -extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; extern struct kmem_cache *btrfs_free_space_cachep; extern struct kmem_cache *btrfs_free_space_bitmap_cachep; struct btrfs_ordered_sum; struct btrfs_ref; struct btrfs_bio; +struct btrfs_ioctl_encoded_io_args; +struct btrfs_device; +struct btrfs_fs_devices; +struct btrfs_balance_control; +struct btrfs_delayed_root; +struct reloc_control; #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ @@ -106,14 +111,6 @@ struct btrfs_bio; #define BTRFS_STAT_CURR 0 #define BTRFS_STAT_PREV 1 -/* - * Count how many BTRFS_MAX_EXTENT_SIZE cover the @size - */ -static inline u32 count_max_extents(u64 size) -{ - return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); -} - static inline unsigned long btrfs_chunk_item_size(int num_stripes) { BUG_ON(num_stripes == 0); @@ -145,6 +142,11 @@ enum { BTRFS_FS_STATE_DUMMY_FS_INFO, BTRFS_FS_STATE_NO_CSUMS, + + /* Indicates there was an error cleaning up a log tree. */ + BTRFS_FS_STATE_LOG_CLEANUP_ERROR, + + BTRFS_FS_STATE_COUNT }; #define BTRFS_BACKREF_REV_MAX 256 @@ -224,6 +226,13 @@ struct btrfs_root_backup { #define BTRFS_SUPER_INFO_SIZE 4096 /* + * The reserved space at the beginning of each device. + * It covers the primary super block and leaves space for potential use by other + * tools like bootloaders or to lower potential damage of accidental overwrite. + */ +#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M) + +/* * the super block basically lists the main trees of the FS * it currently lacks any block count etc etc */ @@ -242,8 +251,12 @@ struct btrfs_super_block { __le64 chunk_root; __le64 log_root; - /* this will help find the new super based on the log root */ - __le64 log_root_transid; + /* + * This member has never been utilized since the very beginning, thus + * it's always 0 regardless of kernel version. We always use + * generation + 1 to read log tree root. So here we mark it deprecated. + */ + __le64 __unused_log_root_transid; __le64 total_bytes; __le64 bytes_used; __le64 root_dir_objectid; @@ -272,7 +285,8 @@ struct btrfs_super_block { u8 metadata_uuid[BTRFS_FSID_SIZE]; /* future expansion */ - __le64 reserved[28]; + u8 reserved8[8]; + __le64 reserved[27]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; @@ -292,11 +306,32 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); #define BTRFS_FEATURE_COMPAT_RO_SUPP \ (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \ BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \ - BTRFS_FEATURE_COMPAT_RO_VERITY) + BTRFS_FEATURE_COMPAT_RO_VERITY | \ + BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE) #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL +#ifdef CONFIG_BTRFS_DEBUG +/* + * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG + */ +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ + BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ + BTRFS_FEATURE_INCOMPAT_RAID56 | \ + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ + BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ + BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ + BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ + BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ + BTRFS_FEATURE_INCOMPAT_ZONED | \ + BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) +#else #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ @@ -311,6 +346,7 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ BTRFS_FEATURE_INCOMPAT_ZONED) +#endif #define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) @@ -407,9 +443,10 @@ struct btrfs_path { * header (ie. sizeof(struct btrfs_item) is not included). */ unsigned int search_for_extension:1; + /* Stop search if any locks need to be taken (for read) */ + unsigned int nowait:1; }; -#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ - sizeof(struct btrfs_item)) + struct btrfs_dev_replace { u64 replace_state; /* see #define above */ time64_t time_started; /* seconds since 1-Jan-1970 */ @@ -466,22 +503,6 @@ struct btrfs_free_cluster { struct list_head block_group_list; }; -enum btrfs_caching_type { - BTRFS_CACHE_NO, - BTRFS_CACHE_STARTED, - BTRFS_CACHE_FAST, - BTRFS_CACHE_FINISHED, - BTRFS_CACHE_ERROR, -}; - -/* - * Tree to record all locked full stripes of a RAID5/6 block group - */ -struct btrfs_full_stripe_locks_tree { - struct rb_root root; - struct mutex lock; -}; - /* Discard control. */ /* * Async discard uses multiple lists to differentiate the discard filter @@ -513,42 +534,6 @@ struct btrfs_discard_ctl { atomic64_t discard_bytes_saved; }; -void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); - -/* fs_info */ -struct reloc_control; -struct btrfs_device; -struct btrfs_fs_devices; -struct btrfs_balance_control; -struct btrfs_delayed_root; - -/* - * Block group or device which contains an active swapfile. Used for preventing - * unsafe operations while a swapfile is active. - * - * These are sorted on (ptr, inode) (note that a block group or device can - * contain more than one swapfile). We compare the pointer values because we - * don't actually care what the object is, we just need a quick check whether - * the object exists in the rbtree. - */ -struct btrfs_swapfile_pin { - struct rb_node node; - void *ptr; - struct inode *inode; - /* - * If true, ptr points to a struct btrfs_block_group. Otherwise, ptr - * points to a struct btrfs_device. - */ - bool is_block_group; - /* - * Only used when 'is_block_group' is true and it is the number of - * extents used by a swapfile for this block group ('ptr' field). - */ - int bg_extent_count; -}; - -bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); - enum { BTRFS_FS_CLOSING_START, BTRFS_FS_CLOSING_DONE, @@ -599,6 +584,12 @@ enum { /* Indicate that we want the transaction kthread to commit right now. */ BTRFS_FS_COMMIT_TRANS, + /* Indicate we have half completed snapshot deletions pending. */ + BTRFS_FS_UNFINISHED_DROPS, + + /* Indicate we have to finish a zone to do next allocation. */ + BTRFS_FS_NEED_ZONE_FINISH, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, @@ -620,6 +611,18 @@ enum btrfs_exclusive_operation { BTRFS_EXCLOP_SWAP_ACTIVATE, }; +/* Store data about transaction commits, exported via sysfs. */ +struct btrfs_commit_stats { + /* Total number of commits */ + u64 commit_count; + /* The maximum commit duration so far in ns */ + u64 max_commit_dur; + /* The last commit duration in ns */ + u64 last_commit_dur; + /* The total commit duration in ns */ + u64 total_commit_dur; +}; + struct btrfs_fs_info { u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; unsigned long flags; @@ -630,6 +633,7 @@ struct btrfs_fs_info { struct btrfs_root *quota_root; struct btrfs_root *uuid_root; struct btrfs_root *data_reloc_root; + struct btrfs_root *block_group_root; /* the log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; @@ -642,9 +646,8 @@ struct btrfs_fs_info { struct radix_tree_root fs_roots_radix; /* block group cache stuff */ - spinlock_t block_group_cache_lock; - u64 first_logical_byte; - struct rb_root block_group_cache_tree; + rwlock_t block_group_cache_lock; + struct rb_root_cached block_group_cache_tree; /* keep track of unallocated space */ atomic64_t free_chunk_space; @@ -811,13 +814,14 @@ struct btrfs_fs_info { * two */ struct btrfs_workqueue *workers; + struct btrfs_workqueue *hipri_workers; struct btrfs_workqueue *delalloc_workers; struct btrfs_workqueue *flush_workers; - struct btrfs_workqueue *endio_workers; - struct btrfs_workqueue *endio_meta_workers; - struct btrfs_workqueue *endio_raid56_workers; - struct btrfs_workqueue *rmw_workers; - struct btrfs_workqueue *endio_meta_write_workers; + struct workqueue_struct *endio_workers; + struct workqueue_struct *endio_meta_workers; + struct workqueue_struct *endio_raid56_workers; + struct workqueue_struct *rmw_workers; + struct workqueue_struct *compressed_write_workers; struct btrfs_workqueue *endio_write_workers; struct btrfs_workqueue *endio_freespace_worker; struct btrfs_workqueue *caching_workers; @@ -836,6 +840,7 @@ struct btrfs_fs_info { struct kobject *space_info_kobj; struct kobject *qgroups_kobj; + struct kobject *discard_kobj; /* used to keep from writing metadata until there is a nice batch */ struct percpu_counter dirty_metadata_bytes; @@ -909,9 +914,9 @@ struct btrfs_fs_info { * running. */ refcount_t scrub_workers_refcnt; - struct btrfs_workqueue *scrub_workers; - struct btrfs_workqueue *scrub_wr_completion_workers; - struct btrfs_workqueue *scrub_parity_workers; + struct workqueue_struct *scrub_workers; + struct workqueue_struct *scrub_wr_completion_workers; + struct workqueue_struct *scrub_parity_workers; struct btrfs_subpage_info *subpage_info; struct btrfs_discard_ctl discard_ctl; @@ -951,6 +956,7 @@ struct btrfs_fs_info { struct completion qgroup_rescan_completion; struct btrfs_work qgroup_rescan_work; bool qgroup_rescan_running; /* protected by qgroup_rescan_lock */ + u8 qgroup_drop_subtree_thres; /* filesystem state */ unsigned long fs_state; @@ -995,6 +1001,12 @@ struct btrfs_fs_info { u32 csums_per_leaf; u32 stripesize; + /* + * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular + * filesystem, on zoned it depends on the device constraints. + */ + u64 max_extent_size; + /* Block groups and devices containing active swapfiles. */ spinlock_t swapfile_pins_lock; struct rb_root swapfile_pins; @@ -1008,11 +1020,10 @@ struct btrfs_fs_info { * Zone size > 0 when in ZONED mode, otherwise it's used for a check * if the mode is enabled */ - union { - u64 zone_size; - u64 zoned; - }; + u64 zone_size; + /* Max size to emit ZONE_APPEND write command */ + u64 max_zone_append_size; struct mutex zoned_meta_io_lock; spinlock_t treelog_bg_lock; u64 treelog_bg; @@ -1023,10 +1034,33 @@ struct btrfs_fs_info { */ spinlock_t relocation_bg_lock; u64 data_reloc_bg; + struct mutex zoned_data_reloc_io_lock; + + u64 nr_global_roots; spinlock_t zone_active_bgs_lock; struct list_head zone_active_bgs; + /* Updates are not protected by any lock */ + struct btrfs_commit_stats commit_stats; + + /* + * Last generation where we dropped a non-relocation root. + * Use btrfs_set_last_root_drop_gen() and btrfs_get_last_root_drop_gen() + * to change it and to read it, respectively. + */ + u64 last_root_drop_gen; + + /* + * Annotations for transaction events (structures are empty when + * compiled without lockdep). + */ + struct lockdep_map btrfs_trans_num_writers_map; + struct lockdep_map btrfs_trans_num_extwriters_map; + struct lockdep_map btrfs_state_change_map[4]; + struct lockdep_map btrfs_trans_pending_ordered_map; + struct lockdep_map btrfs_ordered_extent_map; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; @@ -1034,7 +1068,6 @@ struct btrfs_fs_info { #ifdef CONFIG_BTRFS_DEBUG struct kobject *debug_kobj; - struct kobject *discard_debug_kobj; struct list_head allocated_roots; spinlock_t eb_leak_lock; @@ -1042,12 +1075,85 @@ struct btrfs_fs_info { #endif }; +static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info, + u64 gen) +{ + WRITE_ONCE(fs_info->last_root_drop_gen, gen); +} + +static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_info) +{ + return READ_ONCE(fs_info->last_root_drop_gen); +} + static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) { return sb->s_fs_info; } /* + * Take the number of bytes to be checksummed and figure out how many leaves + * it would require to store the csums for that many bytes. + */ +static inline u64 btrfs_csum_bytes_to_leaves( + const struct btrfs_fs_info *fs_info, u64 csum_bytes) +{ + const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits; + + return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf); +} + +/* + * Use this if we would be adding new items, as we could split nodes as we cow + * down the tree. + */ +static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info, + unsigned num_items) +{ + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; +} + +/* + * Doing a truncate or a modification won't result in new nodes or leaves, just + * what we need for COW. + */ +static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info, + unsigned num_items) +{ + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; +} + +#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ + sizeof(struct btrfs_item)) + +static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) +{ + return fs_info->zone_size > 0; +} + +/* + * Count how many fs_info->max_extent_size cover the @size + */ +static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size) +{ +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (!fs_info) + return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); +#endif + + return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size); +} + +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type); +bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type); +void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info); +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); +void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation op); + +/* * The state of btrfs root */ enum { @@ -1103,9 +1209,94 @@ enum { BTRFS_ROOT_QGROUP_FLUSHING, /* We started the orphan cleanup for this root. */ BTRFS_ROOT_ORPHAN_CLEANUP, + /* This root has a drop operation that was started previously. */ + BTRFS_ROOT_UNFINISHED_DROP, + /* This reloc root needs to have its buffers lockdep class reset. */ + BTRFS_ROOT_RESET_LOCKDEP_CLASS, +}; + +enum btrfs_lockdep_trans_states { + BTRFS_LOCKDEP_TRANS_COMMIT_START, + BTRFS_LOCKDEP_TRANS_UNBLOCKED, + BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED, + BTRFS_LOCKDEP_TRANS_COMPLETED, }; /* + * Lockdep annotation for wait events. + * + * @owner: The struct where the lockdep map is defined + * @lock: The lockdep map corresponding to a wait event + * + * This macro is used to annotate a wait event. In this case a thread acquires + * the lockdep map as writer (exclusive lock) because it has to block until all + * the threads that hold the lock as readers signal the condition for the wait + * event and release their locks. + */ +#define btrfs_might_wait_for_event(owner, lock) \ + do { \ + rwsem_acquire(&owner->lock##_map, 0, 0, _THIS_IP_); \ + rwsem_release(&owner->lock##_map, _THIS_IP_); \ + } while (0) + +/* + * Protection for the resource/condition of a wait event. + * + * @owner: The struct where the lockdep map is defined + * @lock: The lockdep map corresponding to a wait event + * + * Many threads can modify the condition for the wait event at the same time + * and signal the threads that block on the wait event. The threads that modify + * the condition and do the signaling acquire the lock as readers (shared + * lock). + */ +#define btrfs_lockdep_acquire(owner, lock) \ + rwsem_acquire_read(&owner->lock##_map, 0, 0, _THIS_IP_) + +/* + * Used after signaling the condition for a wait event to release the lockdep + * map held by a reader thread. + */ +#define btrfs_lockdep_release(owner, lock) \ + rwsem_release(&owner->lock##_map, _THIS_IP_) + +/* + * Macros for the transaction states wait events, similar to the generic wait + * event macros. + */ +#define btrfs_might_wait_for_state(owner, i) \ + do { \ + rwsem_acquire(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_); \ + rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_); \ + } while (0) + +#define btrfs_trans_state_lockdep_acquire(owner, i) \ + rwsem_acquire_read(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_) + +#define btrfs_trans_state_lockdep_release(owner, i) \ + rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_) + +/* Initialization of the lockdep map */ +#define btrfs_lockdep_init_map(owner, lock) \ + do { \ + static struct lock_class_key lock##_key; \ + lockdep_init_map(&owner->lock##_map, #lock, &lock##_key, 0); \ + } while (0) + +/* Initialization of the transaction states lockdep maps. */ +#define btrfs_state_lockdep_init_map(owner, lock, state) \ + do { \ + static struct lock_class_key lock##_key; \ + lockdep_init_map(&owner->btrfs_state_change_map[state], #lock, \ + &lock##_key, 0); \ + } while (0) + +static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) +{ + clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); +} + +/* * Record swapped tree blocks of a subvolume tree for delayed subtree trace * code. For detail check comment in fs/btrfs/qgroup.c. */ @@ -1284,6 +1475,8 @@ struct btrfs_replace_extent_info { * existing extent into a file range. */ bool is_new_extent; + /* Indicate if we should update the inode's mtime and ctime. */ + bool update_times; /* Meaningful only if is_new_extent is true. */ int qgroup_reserved; /* @@ -1596,25 +1789,25 @@ DECLARE_BTRFS_SETGET_BITS(64) static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ const type *s) \ { \ - BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ return btrfs_get_##bits(eb, s, offsetof(type, member)); \ } \ static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ u##bits val) \ { \ - BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ btrfs_set_##bits(eb, s, offsetof(type, member), val); \ } \ static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ const type *s) \ { \ - BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ return btrfs_get_token_##bits(token, s, offsetof(type, member));\ } \ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ type *s, u##bits val) \ { \ - BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ } @@ -1645,8 +1838,8 @@ static inline void btrfs_set_##name(type *s, u##bits val) \ static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s) { - BUILD_BUG_ON(sizeof(u64) != - sizeof(((struct btrfs_dev_item *)0))->total_bytes); + static_assert(sizeof(u64) == + sizeof(((struct btrfs_dev_item *)0))->total_bytes); return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes)); } @@ -1654,8 +1847,8 @@ static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s, u64 val) { - BUILD_BUG_ON(sizeof(u64) != - sizeof(((struct btrfs_dev_item *)0))->total_bytes); + static_assert(sizeof(u64) == + sizeof(((struct btrfs_dev_item *)0))->total_bytes); WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); } @@ -2418,8 +2611,6 @@ BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, chunk_root_level, 8); BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, log_root, 64); -BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block, - log_root_transid, 64); BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, log_root_level, 8); BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, @@ -2669,37 +2860,6 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, enum btrfs_inline_ref_type is_data); u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); -/* - * Take the number of bytes to be checksummmed and figure out how many leaves - * it would require to store the csums for that many bytes. - */ -static inline u64 btrfs_csum_bytes_to_leaves( - const struct btrfs_fs_info *fs_info, u64 csum_bytes) -{ - const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits; - - return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf); -} - -/* - * Use this if we would be adding new items, as we could split nodes as we cow - * down the tree. - */ -static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info, - unsigned num_items) -{ - return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; -} - -/* - * Doing a truncate or a modification won't result in new nodes or leaves, just - * what we need for COW. - */ -static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info, - unsigned num_items) -{ - return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; -} int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, u64 start, u64 num_bytes); @@ -2719,7 +2879,8 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes); int btrfs_exclude_logged_extents(struct extent_buffer *eb); int btrfs_cross_ref_exist(struct btrfs_root *root, - u64 objectid, u64 offset, u64 bytenr, bool strict); + u64 objectid, u64 offset, u64 bytenr, bool strict, + struct btrfs_path *path); struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, @@ -2746,8 +2907,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct extent_buffer *eb, u64 flags, - int level, int is_data); + struct extent_buffer *eb, u64 flags, int level); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, @@ -2826,7 +2986,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv); void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); -int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); +int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, + u64 disk_num_bytes, bool noflush); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); @@ -2973,6 +3134,35 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *path); +int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *path); + +/* + * Search in @root for a given @key, and store the slot found in @found_key. + * + * @root: The root node of the tree. + * @key: The key we are looking for. + * @found_key: Will hold the found item. + * @path: Holds the current slot/leaf. + * @iter_ret: Contains the value returned from btrfs_search_slot or + * btrfs_get_next_valid_item, whichever was executed last. + * + * The @iter_ret is an output variable that will contain the return value of + * btrfs_search_slot, if it encountered an error, or the value returned from + * btrfs_get_next_valid_item otherwise. That return value can be 0, if a valid + * slot was found, 1 if there were no more leaves, and <0 if there was an error. + * + * It's recommended to use a separate variable for iter_ret and then use it to + * set the function return value so there's no confusion of the 0/1/errno + * values stemming from btrfs_search_slot. + */ +#define btrfs_for_each_slot(root, key, found_key, path, iter_ret) \ + for (iter_ret = btrfs_search_slot(NULL, (root), (key), (path), 0, 0); \ + (iter_ret) >= 0 && \ + (iter_ret = btrfs_get_next_valid_item((root), (found_key), (path))) == 0; \ + (path)->slots[0]++ \ + ) + static inline int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *p, u64 time_seq) { @@ -3124,16 +3314,12 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); /* file-item.c */ -struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); -int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 objectid, u64 pos, - u64 disk_offset, u64 disk_num_bytes, - u64 num_bytes, u64 offset, u64 ram_bytes, - u8 compression, u8 encryption, u16 other_encoding); +int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid, u64 pos, + u64 num_bytes); int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid, @@ -3142,9 +3328,10 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - u64 file_start, int contig); + u64 offset, bool one_ordered); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list, int search_commit); + struct list_head *list, int search_commit, + bool nowait); void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, struct btrfs_file_extent_item *fi, @@ -3158,16 +3345,21 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ -blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags); +void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num); +void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, + int mirror_num, enum btrfs_compression_type compress_type); +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, + u32 pgoff, u8 *csum, const u8 * const csum_expected); +int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, u32 pgoff); unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u64 start, u64 end); -struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, - u64 start, u64 len); +int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, u32 pgoff); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes, bool strict); + u64 *ram_bytes, bool nowait, bool strict); void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode); @@ -3189,23 +3381,39 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state); -int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, - struct btrfs_root *parent_root, - struct user_namespace *mnt_userns); +struct btrfs_new_inode_args { + /* Input */ + struct inode *dir; + struct dentry *dentry; + struct inode *inode; + bool orphan; + bool subvol; + + /* + * Output from btrfs_new_inode_prepare(), input to + * btrfs_create_new_inode(). + */ + struct posix_acl *default_acl; + struct posix_acl *acl; +}; +int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, + unsigned int *trans_num_items); +int btrfs_create_new_inode(struct btrfs_trans_handle *trans, + struct btrfs_new_inode_args *args); +void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); +struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, + struct inode *dir); void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, - unsigned *bits); + u32 bits); void btrfs_clear_delalloc_extent(struct inode *inode, - struct extent_state *state, unsigned *bits); + struct extent_state *state, u32 bits); void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, struct extent_state *other); void btrfs_split_delalloc_extent(struct inode *inode, struct extent_state *orig, u64 split); void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); -int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); -int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); void btrfs_free_inode(struct inode *inode); @@ -3243,9 +3451,23 @@ int btrfs_writepage_cow_fixup(struct page *page); void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, struct page *page, u64 start, u64 end, bool uptodate); +int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, + int compress_type); +int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + u64 file_offset, u64 disk_bytenr, + u64 disk_io_size, + struct page **pages); +ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, + struct btrfs_ioctl_encoded_io_args *encoded); +ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded); + +ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before); +struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before); + extern const struct dentry_operations btrfs_dentry_operations; -extern const struct iomap_ops btrfs_dio_iomap_ops; -extern const struct iomap_dio_ops btrfs_dio_ops; /* Inode locking type flags, by default the exclusive lock is taken */ #define BTRFS_ILOCK_SHARED (1U << 0) @@ -3257,6 +3479,7 @@ void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes, const u64 del_bytes); +void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end); /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -3274,26 +3497,15 @@ void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); -bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation type); -bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation type); -void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info); -void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); -void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation op); - /* file.c */ int __init btrfs_auto_defrag_init(void); void __cold btrfs_auto_defrag_exit(void); int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode); + struct btrfs_inode *inode, u32 extent_thresh); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); -void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end, - int skip_pinned); extern const struct file_operations btrfs_file_operations; int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, @@ -3305,14 +3517,18 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, struct btrfs_trans_handle **trans_out); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 start, u64 end); +ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded); int btrfs_release_file(struct inode *inode, struct file *file); int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached, bool noreserve); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes); + size_t *write_bytes, bool nowait); void btrfs_check_nocow_unlock(struct btrfs_inode *inode); +bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, + u64 *delalloc_start_ret, u64 *delalloc_end_ret); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, @@ -3330,11 +3546,29 @@ void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { } -#ifdef CONFIG_PRINTK +#ifdef CONFIG_PRINTK_INDEX + +#define btrfs_printk(fs_info, fmt, args...) \ +do { \ + printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); \ + _btrfs_printk(fs_info, fmt, ##args); \ +} while (0) + +__printf(2, 3) +__cold +void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); + +#elif defined(CONFIG_PRINTK) + +#define btrfs_printk(fs_info, fmt, args...) \ + _btrfs_printk(fs_info, fmt, ##args) + __printf(2, 3) __cold -void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); +void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); + #else + #define btrfs_printk(fs_info, fmt, args...) \ btrfs_no_printk(fs_info, fmt, ##args) #endif @@ -3560,21 +3794,26 @@ const char * __attribute_const__ btrfs_decode_error(int errno); __cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, const char *function, - unsigned int line, int errno); + unsigned int line, int errno, bool first_hit); + +bool __cold abort_should_print_stack(int errno); /* * Call btrfs_abort_transaction as early as possible when an error condition is - * detected, that way the exact line number is reported. + * detected, that way the exact stack trace is reported for some errors. */ #define btrfs_abort_transaction(trans, errno) \ do { \ + bool first = false; \ /* Report first abort since mount */ \ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ &((trans)->fs_info->fs_state))) { \ - if ((errno) != -EIO && (errno) != -EROFS) { \ - WARN(1, KERN_DEBUG \ + first = true; \ + if (WARN(abort_should_print_stack(errno), \ + KERN_DEBUG \ "BTRFS: Transaction aborted (error %d)\n", \ - (errno)); \ + (errno))) { \ + /* Stack trace printed. */ \ } else { \ btrfs_debug((trans)->fs_info, \ "Transaction aborted (error %d)", \ @@ -3582,17 +3821,33 @@ do { \ } \ } \ __btrfs_abort_transaction((trans), __func__, \ - __LINE__, (errno)); \ + __LINE__, (errno), first); \ } while (0) +#ifdef CONFIG_PRINTK_INDEX + #define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ -do { \ - __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args); \ +do { \ + printk_index_subsys_emit( \ + "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", \ + KERN_CRIT, fmt); \ + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args); \ } while (0) +#else + +#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args) + +#endif + #define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ &(fs_info)->fs_state))) +#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ + (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ + &(fs_info)->fs_state))) __printf(5, 6) __cold @@ -3740,15 +3995,16 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); -int btrfs_init_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir); +int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, + struct posix_acl *acl, int type); #else #define btrfs_get_acl NULL #define btrfs_set_acl NULL -static inline int btrfs_init_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir) +static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct posix_acl *acl, + int type) { - return 0; + return -EOPNOTSUPP; } #endif @@ -3758,7 +4014,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_recover_relocation(struct btrfs_root *root); +int btrfs_recover_relocation(struct btrfs_fs_info *fs_info); int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len); int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, @@ -3782,16 +4038,9 @@ int btrfs_scrub_cancel(struct btrfs_fs_info *info); int btrfs_scrub_cancel_dev(struct btrfs_device *dev); int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, struct btrfs_scrub_progress *progress); -static inline void btrfs_init_full_stripe_locks_tree( - struct btrfs_full_stripe_locks_tree *locks_root) -{ - locks_root->root = RB_ROOT; - mutex_init(&locks_root->lock); -} /* dev-replace.c */ void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); -void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info); void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount); static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) @@ -3818,6 +4067,7 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) extern const struct fsverity_operations btrfs_verityops; int btrfs_drop_verity_items(struct btrfs_inode *inode); +int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size); BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item, encryption, 8); @@ -3835,6 +4085,12 @@ static inline int btrfs_drop_verity_items(struct btrfs_inode *inode) return 0; } +static inline int btrfs_get_verity_descriptor(struct inode *inode, void *buf, + size_t buf_size) +{ + return -EPERM; +} + #endif /* Sanity test specific functions */ @@ -3851,11 +4107,6 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) } #endif -static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) -{ - return fs_info->zoned != 0; -} - static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) { return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; @@ -3870,5 +4121,8 @@ static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) #define PageOrdered(page) PagePrivate2(page) #define SetPageOrdered(page) SetPagePrivate2(page) #define ClearPageOrdered(page) ClearPagePrivate2(page) +#define folio_test_ordered(folio) folio_test_private_2(folio) +#define folio_set_ordered(folio) folio_set_private_2(folio) +#define folio_clear_ordered(folio) folio_clear_private_2(folio) #endif diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index fb46a28f5065..118b2e20b2e1 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -127,9 +127,11 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) } int btrfs_check_data_free_space(struct btrfs_inode *inode, - struct extent_changeset **reserved, u64 start, u64 len) + struct extent_changeset **reserved, u64 start, + u64 len, bool noflush) { struct btrfs_fs_info *fs_info = inode->root->fs_info; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA; int ret; /* align the range */ @@ -137,7 +139,12 @@ int btrfs_check_data_free_space(struct btrfs_inode *inode, round_down(start, fs_info->sectorsize); start = round_down(start, fs_info->sectorsize); - ret = btrfs_alloc_data_chunk_ondemand(inode, len); + if (noflush) + flush = BTRFS_RESERVE_NO_FLUSH; + else if (btrfs_is_free_space_inode(inode)) + flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE; + + ret = btrfs_reserve_data_bytes(fs_info, len, flush); if (ret < 0) return ret; @@ -270,11 +277,11 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, } static void calc_inode_reservations(struct btrfs_fs_info *fs_info, - u64 num_bytes, u64 *meta_reserve, - u64 *qgroup_reserve) + u64 num_bytes, u64 disk_num_bytes, + u64 *meta_reserve, u64 *qgroup_reserve) { - u64 nr_extents = count_max_extents(num_bytes); - u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes); + u64 nr_extents = count_max_extents(fs_info, num_bytes); + u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes); u64 inode_update = btrfs_calc_metadata_size(fs_info, 1); *meta_reserve = btrfs_calc_insert_metadata_size(fs_info, @@ -288,7 +295,8 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info, *qgroup_reserve = nr_extents * fs_info->nodesize; } -int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) +int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, + u64 disk_num_bytes, bool noflush) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -307,7 +315,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) * If we have a transaction open (can happen if we call truncate_block * from truncate), then we need FLUSH_LIMIT so we don't deadlock. */ - if (btrfs_is_free_space_inode(inode)) { + if (noflush || btrfs_is_free_space_inode(inode)) { flush = BTRFS_RESERVE_NO_FLUSH; } else { if (current->journal_info) @@ -318,6 +326,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) } num_bytes = ALIGN(num_bytes, fs_info->sectorsize); + disk_num_bytes = ALIGN(disk_num_bytes, fs_info->sectorsize); /* * We always want to do it this way, every other way is wrong and ends @@ -329,9 +338,10 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) * everything out and try again, which is bad. This way we just * over-reserve slightly, and clean up the mess when we are done. */ - calc_inode_reservations(fs_info, num_bytes, &meta_reserve, - &qgroup_reserve); - ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true); + calc_inode_reservations(fs_info, num_bytes, disk_num_bytes, + &meta_reserve, &qgroup_reserve); + ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true, + noflush); if (ret) return ret; ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush); @@ -347,9 +357,9 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) * needs to free the reservation we just made. */ spin_lock(&inode->lock); - nr_extents = count_max_extents(num_bytes); + nr_extents = count_max_extents(fs_info, num_bytes); btrfs_mod_outstanding_extents(inode, nr_extents); - inode->csum_bytes += num_bytes; + inode->csum_bytes += disk_num_bytes; btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); @@ -410,7 +420,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) unsigned num_extents; spin_lock(&inode->lock); - num_extents = count_max_extents(num_bytes); + num_extents = count_max_extents(fs_info, num_bytes); btrfs_mod_outstanding_extents(inode, -num_extents); btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); @@ -451,10 +461,10 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, { int ret; - ret = btrfs_check_data_free_space(inode, reserved, start, len); + ret = btrfs_check_data_free_space(inode, reserved, start, len, false); if (ret < 0) return ret; - ret = btrfs_delalloc_reserve_metadata(inode, len); + ret = btrfs_delalloc_reserve_metadata(inode, len, len, false); if (ret < 0) { btrfs_free_reserved_data_space(inode, *reserved, start, len); extent_changeset_free(*reserved); diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h index 28bf5c3ef430..e07d46043455 100644 --- a/fs/btrfs/delalloc-space.h +++ b/fs/btrfs/delalloc-space.h @@ -7,7 +7,8 @@ struct extent_changeset; int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); int btrfs_check_data_free_space(struct btrfs_inode *inode, - struct extent_changeset **reserved, u64 start, u64 len); + struct extent_changeset **reserved, u64 start, u64 len, + bool noflush); void btrfs_free_reserved_data_space(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len); void btrfs_delalloc_release_space(struct btrfs_inode *inode, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 748bf6b0d860..cac5169eaf8d 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -52,18 +52,6 @@ static inline void btrfs_init_delayed_node( INIT_LIST_HEAD(&delayed_node->p_list); } -static inline int btrfs_is_continuous_delayed_item( - struct btrfs_delayed_item *item1, - struct btrfs_delayed_item *item2) -{ - if (item1->key.type == BTRFS_DIR_INDEX_KEY && - item1->key.objectid == item2->key.objectid && - item1->key.type == item2->key.type && - item1->key.offset + 1 == item2->key.offset) - return 1; - return 0; -} - static struct btrfs_delayed_node *btrfs_get_delayed_node( struct btrfs_inode *btrfs_inode) { @@ -314,15 +302,21 @@ static inline void btrfs_release_prepared_delayed_node( __btrfs_release_delayed_node(node, 1); } -static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len) +static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len, + struct btrfs_delayed_node *node, + enum btrfs_delayed_item_type type) { struct btrfs_delayed_item *item; + item = kmalloc(sizeof(*item) + data_len, GFP_NOFS); if (item) { item->data_len = data_len; - item->ins_or_del = 0; + item->type = type; item->bytes_reserved = 0; - item->delayed_node = NULL; + item->delayed_node = node; + RB_CLEAR_NODE(&item->rb_node); + INIT_LIST_HEAD(&item->log_list); + item->logged = false; refcount_set(&item->refs, 1); } return item; @@ -331,89 +325,46 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len) /* * __btrfs_lookup_delayed_item - look up the delayed item by key * @delayed_node: pointer to the delayed node - * @key: the key to look up - * @prev: used to store the prev item if the right item isn't found - * @next: used to store the next item if the right item isn't found + * @index: the dir index value to lookup (offset of a dir index key) * * Note: if we don't find the right item, we will return the prev item and * the next item. */ static struct btrfs_delayed_item *__btrfs_lookup_delayed_item( struct rb_root *root, - struct btrfs_key *key, - struct btrfs_delayed_item **prev, - struct btrfs_delayed_item **next) + u64 index) { - struct rb_node *node, *prev_node = NULL; + struct rb_node *node = root->rb_node; struct btrfs_delayed_item *delayed_item = NULL; - int ret = 0; - - node = root->rb_node; while (node) { delayed_item = rb_entry(node, struct btrfs_delayed_item, rb_node); - prev_node = node; - ret = btrfs_comp_cpu_keys(&delayed_item->key, key); - if (ret < 0) + if (delayed_item->index < index) node = node->rb_right; - else if (ret > 0) + else if (delayed_item->index > index) node = node->rb_left; else return delayed_item; } - if (prev) { - if (!prev_node) - *prev = NULL; - else if (ret < 0) - *prev = delayed_item; - else if ((node = rb_prev(prev_node)) != NULL) { - *prev = rb_entry(node, struct btrfs_delayed_item, - rb_node); - } else - *prev = NULL; - } - - if (next) { - if (!prev_node) - *next = NULL; - else if (ret > 0) - *next = delayed_item; - else if ((node = rb_next(prev_node)) != NULL) { - *next = rb_entry(node, struct btrfs_delayed_item, - rb_node); - } else - *next = NULL; - } return NULL; } -static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item( - struct btrfs_delayed_node *delayed_node, - struct btrfs_key *key) -{ - return __btrfs_lookup_delayed_item(&delayed_node->ins_root.rb_root, key, - NULL, NULL); -} - static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, - struct btrfs_delayed_item *ins, - int action) + struct btrfs_delayed_item *ins) { struct rb_node **p, *node; struct rb_node *parent_node = NULL; struct rb_root_cached *root; struct btrfs_delayed_item *item; - int cmp; bool leftmost = true; - if (action == BTRFS_DELAYED_INSERTION_ITEM) + if (ins->type == BTRFS_DELAYED_INSERTION_ITEM) root = &delayed_node->ins_root; - else if (action == BTRFS_DELAYED_DELETION_ITEM) - root = &delayed_node->del_root; else - BUG(); + root = &delayed_node->del_root; + p = &root->rb_root.rb_node; node = &ins->rb_node; @@ -422,11 +373,10 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, item = rb_entry(parent_node, struct btrfs_delayed_item, rb_node); - cmp = btrfs_comp_cpu_keys(&item->key, &ins->key); - if (cmp < 0) { + if (item->index < ins->index) { p = &(*p)->rb_right; leftmost = false; - } else if (cmp > 0) { + } else if (item->index > ins->index) { p = &(*p)->rb_left; } else { return -EEXIST; @@ -435,33 +385,16 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, rb_link_node(node, parent_node, p); rb_insert_color_cached(node, root, leftmost); - ins->delayed_node = delayed_node; - ins->ins_or_del = action; - if (ins->key.type == BTRFS_DIR_INDEX_KEY && - action == BTRFS_DELAYED_INSERTION_ITEM && - ins->key.offset >= delayed_node->index_cnt) - delayed_node->index_cnt = ins->key.offset + 1; + if (ins->type == BTRFS_DELAYED_INSERTION_ITEM && + ins->index >= delayed_node->index_cnt) + delayed_node->index_cnt = ins->index + 1; delayed_node->count++; atomic_inc(&delayed_node->root->fs_info->delayed_root->items); return 0; } -static int __btrfs_add_delayed_insertion_item(struct btrfs_delayed_node *node, - struct btrfs_delayed_item *item) -{ - return __btrfs_add_delayed_item(node, item, - BTRFS_DELAYED_INSERTION_ITEM); -} - -static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node, - struct btrfs_delayed_item *item) -{ - return __btrfs_add_delayed_item(node, item, - BTRFS_DELAYED_DELETION_ITEM); -} - static void finish_one_item(struct btrfs_delayed_root *delayed_root) { int seq = atomic_inc_return(&delayed_root->items_seq); @@ -477,21 +410,21 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) struct rb_root_cached *root; struct btrfs_delayed_root *delayed_root; - /* Not associated with any delayed_node */ - if (!delayed_item->delayed_node) + /* Not inserted, ignore it. */ + if (RB_EMPTY_NODE(&delayed_item->rb_node)) return; + delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root; BUG_ON(!delayed_root); - BUG_ON(delayed_item->ins_or_del != BTRFS_DELAYED_DELETION_ITEM && - delayed_item->ins_or_del != BTRFS_DELAYED_INSERTION_ITEM); - if (delayed_item->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM) + if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM) root = &delayed_item->delayed_node->ins_root; else root = &delayed_item->delayed_node->del_root; rb_erase_cached(&delayed_item->rb_node, root); + RB_CLEAR_NODE(&delayed_item->rb_node); delayed_item->delayed_node->count--; finish_one_item(delayed_root); @@ -546,12 +479,11 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item( } static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_delayed_item *item) { struct btrfs_block_rsv *src_rsv; struct btrfs_block_rsv *dst_rsv; - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; u64 num_bytes; int ret; @@ -571,9 +503,15 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true); if (!ret) { trace_btrfs_space_reservation(fs_info, "delayed_item", - item->key.objectid, + item->delayed_node->inode_id, num_bytes, 1); - item->bytes_reserved = num_bytes; + /* + * For insertions we track reserved metadata space by accounting + * for the number of leaves that will be used, based on the delayed + * node's index_items_size field. + */ + if (item->type == BTRFS_DELAYED_DELETION_ITEM) + item->bytes_reserved = num_bytes; } return ret; @@ -594,11 +532,26 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, * to release/reserve qgroup space. */ trace_btrfs_space_reservation(fs_info, "delayed_item", - item->key.objectid, item->bytes_reserved, - 0); + item->delayed_node->inode_id, + item->bytes_reserved, 0); btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL); } +static void btrfs_delayed_item_release_leaves(struct btrfs_delayed_node *node, + unsigned int num_leaves) +{ + struct btrfs_fs_info *fs_info = node->root->fs_info; + const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, num_leaves); + + /* There are no space reservations during log replay, bail out. */ + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) + return; + + trace_btrfs_space_reservation(fs_info, "delayed_item", node->inode_id, + bytes, 0); + btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, bytes, NULL); +} + static int btrfs_delayed_inode_reserve_metadata( struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -672,36 +625,78 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info, } /* - * Insert a single delayed item or a batch of delayed items that have consecutive - * keys if they exist. + * Insert a single delayed item or a batch of delayed items, as many as possible + * that fit in a leaf. The delayed items (dir index keys) are sorted by their key + * in the rbtree, and if there's a gap between two consecutive dir index items, + * then it means at some point we had delayed dir indexes to add but they got + * removed (by btrfs_delete_delayed_dir_index()) before we attempted to flush them + * into the subvolume tree. Dir index keys also have their offsets coming from a + * monotonically increasing counter, so we can't get new keys with an offset that + * fits within a gap between delayed dir index items. */ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_delayed_item *first_item) { + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_delayed_node *node = first_item->delayed_node; LIST_HEAD(item_list); struct btrfs_delayed_item *curr; struct btrfs_delayed_item *next; - const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info); + const int max_size = BTRFS_LEAF_DATA_SIZE(fs_info); struct btrfs_item_batch batch; + struct btrfs_key first_key; + const u32 first_data_size = first_item->data_len; int total_size; char *ins_data = NULL; int ret; + bool continuous_keys_only = false; + + lockdep_assert_held(&node->mutex); + + /* + * During normal operation the delayed index offset is continuously + * increasing, so we can batch insert all items as there will not be any + * overlapping keys in the tree. + * + * The exception to this is log replay, where we may have interleaved + * offsets in the tree, so our batch needs to be continuous keys only in + * order to ensure we do not end up with out of order items in our leaf. + */ + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) + continuous_keys_only = true; + + /* + * For delayed items to insert, we track reserved metadata bytes based + * on the number of leaves that we will use. + * See btrfs_insert_delayed_dir_index() and + * btrfs_delayed_item_reserve_metadata()). + */ + ASSERT(first_item->bytes_reserved == 0); list_add_tail(&first_item->tree_list, &item_list); - batch.total_data_size = first_item->data_len; + batch.total_data_size = first_data_size; batch.nr = 1; - total_size = first_item->data_len + sizeof(struct btrfs_item); + total_size = first_data_size + sizeof(struct btrfs_item); curr = first_item; while (true) { int next_size; next = __btrfs_next_delayed_item(curr); - if (!next || !btrfs_is_continuous_delayed_item(curr, next)) + if (!next) break; + /* + * We cannot allow gaps in the key space if we're doing log + * replay. + */ + if (continuous_keys_only && (next->index != curr->index + 1)) + break; + + ASSERT(next->bytes_reserved == 0); + next_size = next->data_len + sizeof(struct btrfs_item); if (total_size + next_size > max_size) break; @@ -714,8 +709,11 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, } if (batch.nr == 1) { - batch.keys = &first_item->key; - batch.data_sizes = &first_item->data_len; + first_key.objectid = node->inode_id; + first_key.type = BTRFS_DIR_INDEX_KEY; + first_key.offset = first_item->index; + batch.keys = &first_key; + batch.data_sizes = &first_data_size; } else { struct btrfs_key *ins_keys; u32 *ins_sizes; @@ -732,7 +730,9 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, batch.keys = ins_keys; batch.data_sizes = ins_sizes; list_for_each_entry(curr, &item_list, tree_list) { - ins_keys[i] = curr->key; + ins_keys[i].objectid = node->inode_id; + ins_keys[i].type = BTRFS_DIR_INDEX_KEY; + ins_keys[i].offset = curr->index; ins_sizes[i] = curr->data_len; i++; } @@ -758,9 +758,41 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, */ btrfs_release_path(path); + ASSERT(node->index_item_leaves > 0); + + /* + * For normal operations we will batch an entire leaf's worth of delayed + * items, so if there are more items to process we can decrement + * index_item_leaves by 1 as we inserted 1 leaf's worth of items. + * + * However for log replay we may not have inserted an entire leaf's + * worth of items, we may have not had continuous items, so decrementing + * here would mess up the index_item_leaves accounting. For this case + * only clean up the accounting when there are no items left. + */ + if (next && !continuous_keys_only) { + /* + * We inserted one batch of items into a leaf a there are more + * items to flush in a future batch, now release one unit of + * metadata space from the delayed block reserve, corresponding + * the leaf we just flushed to. + */ + btrfs_delayed_item_release_leaves(node, 1); + node->index_item_leaves--; + } else if (!next) { + /* + * There are no more items to insert. We can have a number of + * reserved leaves > 1 here - this happens when many dir index + * items are added and then removed before they are flushed (file + * names with a very short life, never span a transaction). So + * release all remaining leaves. + */ + btrfs_delayed_item_release_leaves(node, node->index_item_leaves); + node->index_item_leaves = 0; + } + list_for_each_entry_safe(curr, next, &item_list, tree_list) { list_del(&curr->tree_list); - btrfs_delayed_item_release_metadata(root, curr); btrfs_release_delayed_item(curr); } out: @@ -796,62 +828,77 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_delayed_item *item) { + const u64 ino = item->delayed_node->inode_id; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_delayed_item *curr, *next; - struct extent_buffer *leaf; - struct btrfs_key key; - struct list_head head; - int nitems, i, last_item; - int ret = 0; + struct extent_buffer *leaf = path->nodes[0]; + LIST_HEAD(batch_list); + int nitems, slot, last_slot; + int ret; + u64 total_reserved_size = item->bytes_reserved; - BUG_ON(!path->nodes[0]); + ASSERT(leaf != NULL); - leaf = path->nodes[0]; + slot = path->slots[0]; + last_slot = btrfs_header_nritems(leaf) - 1; + /* + * Our caller always gives us a path pointing to an existing item, so + * this can not happen. + */ + ASSERT(slot <= last_slot); + if (WARN_ON(slot > last_slot)) + return -ENOENT; - i = path->slots[0]; - last_item = btrfs_header_nritems(leaf) - 1; - if (i > last_item) - return -ENOENT; /* FIXME: Is errno suitable? */ + nitems = 1; + curr = item; + list_add_tail(&curr->tree_list, &batch_list); - next = item; - INIT_LIST_HEAD(&head); - btrfs_item_key_to_cpu(leaf, &key, i); - nitems = 0; /* - * count the number of the dir index items that we can delete in batch + * Keep checking if the next delayed item matches the next item in the + * leaf - if so, we can add it to the batch of items to delete from the + * leaf. */ - while (btrfs_comp_cpu_keys(&next->key, &key) == 0) { - list_add_tail(&next->tree_list, &head); - nitems++; + while (slot < last_slot) { + struct btrfs_key key; - curr = next; next = __btrfs_next_delayed_item(curr); if (!next) break; - if (!btrfs_is_continuous_delayed_item(curr, next)) - break; - - i++; - if (i > last_item) + slot++; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != ino || + key.type != BTRFS_DIR_INDEX_KEY || + key.offset != next->index) break; - btrfs_item_key_to_cpu(leaf, &key, i); + nitems++; + curr = next; + list_add_tail(&curr->tree_list, &batch_list); + total_reserved_size += curr->bytes_reserved; } - if (!nitems) - return 0; - ret = btrfs_del_items(trans, root, path, path->slots[0], nitems); if (ret) - goto out; + return ret; - list_for_each_entry_safe(curr, next, &head, tree_list) { - btrfs_delayed_item_release_metadata(root, curr); + /* In case of BTRFS_FS_LOG_RECOVERING items won't have reserved space */ + if (total_reserved_size > 0) { + /* + * Check btrfs_delayed_item_reserve_metadata() to see why we + * don't need to release/reserve qgroup space. + */ + trace_btrfs_space_reservation(fs_info, "delayed_item", ino, + total_reserved_size, 0); + btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, + total_reserved_size, NULL); + } + + list_for_each_entry_safe(curr, next, &batch_list, tree_list) { list_del(&curr->tree_list); btrfs_release_delayed_item(curr); } -out: - return ret; + return 0; } static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, @@ -859,43 +906,57 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_delayed_node *node) { - struct btrfs_delayed_item *curr, *prev; + struct btrfs_key key; int ret = 0; -do_again: - mutex_lock(&node->mutex); - curr = __btrfs_first_delayed_deletion_item(node); - if (!curr) - goto delete_fail; + key.objectid = node->inode_id; + key.type = BTRFS_DIR_INDEX_KEY; + + while (ret == 0) { + struct btrfs_delayed_item *item; + + mutex_lock(&node->mutex); + item = __btrfs_first_delayed_deletion_item(node); + if (!item) { + mutex_unlock(&node->mutex); + break; + } + + key.offset = item->index; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + /* + * There's no matching item in the leaf. This means we + * have already deleted this item in a past run of the + * delayed items. We ignore errors when running delayed + * items from an async context, through a work queue job + * running btrfs_async_run_delayed_root(), and don't + * release delayed items that failed to complete. This + * is because we will retry later, and at transaction + * commit time we always run delayed items and will + * then deal with errors if they fail to run again. + * + * So just release delayed items for which we can't find + * an item in the tree, and move to the next item. + */ + btrfs_release_path(path); + btrfs_release_delayed_item(item); + ret = 0; + } else if (ret == 0) { + ret = btrfs_batch_delete_items(trans, root, path, item); + btrfs_release_path(path); + } - ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1); - if (ret < 0) - goto delete_fail; - else if (ret > 0) { /* - * can't find the item which the node points to, so this node - * is invalid, just drop it. + * We unlock and relock on each iteration, this is to prevent + * blocking other tasks for too long while we are being run from + * the async context (work queue job). Those tasks are typically + * running system calls like creat/mkdir/rename/unlink/etc which + * need to add delayed items to this delayed node. */ - prev = curr; - curr = __btrfs_next_delayed_item(prev); - btrfs_release_delayed_item(prev); - ret = 0; - btrfs_release_path(path); - if (curr) { - mutex_unlock(&node->mutex); - goto do_again; - } else - goto delete_fail; + mutex_unlock(&node->mutex); } - btrfs_batch_delete_items(trans, root, path, curr); - btrfs_release_path(path); - mutex_unlock(&node->mutex); - goto do_again; - -delete_fail: - btrfs_release_path(path); - mutex_unlock(&node->mutex); return ret; } @@ -1354,24 +1415,28 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, struct btrfs_disk_key *disk_key, u8 type, u64 index) { + struct btrfs_fs_info *fs_info = trans->fs_info; + const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info); struct btrfs_delayed_node *delayed_node; struct btrfs_delayed_item *delayed_item; struct btrfs_dir_item *dir_item; + bool reserve_leaf_space; + u32 data_len; int ret; delayed_node = btrfs_get_or_create_delayed_node(dir); if (IS_ERR(delayed_node)) return PTR_ERR(delayed_node); - delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len); + delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len, + delayed_node, + BTRFS_DELAYED_INSERTION_ITEM); if (!delayed_item) { ret = -ENOMEM; goto release_node; } - delayed_item->key.objectid = btrfs_ino(dir); - delayed_item->key.type = BTRFS_DIR_INDEX_KEY; - delayed_item->key.offset = index; + delayed_item->index = index; dir_item = (struct btrfs_dir_item *)delayed_item->data; dir_item->location = *disk_key; @@ -1381,15 +1446,51 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, btrfs_set_stack_dir_type(dir_item, type); memcpy((char *)(dir_item + 1), name, name_len); - ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, delayed_item); - /* - * we have reserved enough space when we start a new transaction, - * so reserving metadata failure is impossible - */ - BUG_ON(ret); + data_len = delayed_item->data_len + sizeof(struct btrfs_item); mutex_lock(&delayed_node->mutex); - ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); + + if (delayed_node->index_item_leaves == 0 || + delayed_node->curr_index_batch_size + data_len > leaf_data_size) { + delayed_node->curr_index_batch_size = data_len; + reserve_leaf_space = true; + } else { + delayed_node->curr_index_batch_size += data_len; + reserve_leaf_space = false; + } + + if (reserve_leaf_space) { + ret = btrfs_delayed_item_reserve_metadata(trans, delayed_item); + /* + * Space was reserved for a dir index item insertion when we + * started the transaction, so getting a failure here should be + * impossible. + */ + if (WARN_ON(ret)) { + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_item(delayed_item); + goto release_node; + } + + delayed_node->index_item_leaves++; + } else if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { + const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1); + + /* + * Adding the new dir index item does not require touching another + * leaf, so we can release 1 unit of metadata that was previously + * reserved when starting the transaction. This applies only to + * the case where we had a transaction start and excludes the + * transaction join case (when replaying log trees). + */ + trace_btrfs_space_reservation(fs_info, "transaction", + trans->transid, bytes, 0); + btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL); + ASSERT(trans->bytes_reserved >= bytes); + trans->bytes_reserved -= bytes; + } + + ret = __btrfs_add_delayed_item(delayed_node, delayed_item); if (unlikely(ret)) { btrfs_err(trans->fs_info, "err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", @@ -1406,19 +1507,48 @@ release_node: static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info, struct btrfs_delayed_node *node, - struct btrfs_key *key) + u64 index) { struct btrfs_delayed_item *item; mutex_lock(&node->mutex); - item = __btrfs_lookup_delayed_insertion_item(node, key); + item = __btrfs_lookup_delayed_item(&node->ins_root.rb_root, index); if (!item) { mutex_unlock(&node->mutex); return 1; } - btrfs_delayed_item_release_metadata(node->root, item); + /* + * For delayed items to insert, we track reserved metadata bytes based + * on the number of leaves that we will use. + * See btrfs_insert_delayed_dir_index() and + * btrfs_delayed_item_reserve_metadata()). + */ + ASSERT(item->bytes_reserved == 0); + ASSERT(node->index_item_leaves > 0); + + /* + * If there's only one leaf reserved, we can decrement this item from the + * current batch, otherwise we can not because we don't know which leaf + * it belongs to. With the current limit on delayed items, we rarely + * accumulate enough dir index items to fill more than one leaf (even + * when using a leaf size of 4K). + */ + if (node->index_item_leaves == 1) { + const u32 data_len = item->data_len + sizeof(struct btrfs_item); + + ASSERT(node->curr_index_batch_size >= data_len); + node->curr_index_batch_size -= data_len; + } + btrfs_release_delayed_item(item); + + /* If we now have no more dir index items, we can release all leaves. */ + if (RB_EMPTY_ROOT(&node->ins_root.rb_root)) { + btrfs_delayed_item_release_leaves(node, node->index_item_leaves); + node->index_item_leaves = 0; + } + mutex_unlock(&node->mutex); return 0; } @@ -1428,31 +1558,25 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, { struct btrfs_delayed_node *node; struct btrfs_delayed_item *item; - struct btrfs_key item_key; int ret; node = btrfs_get_or_create_delayed_node(dir); if (IS_ERR(node)) return PTR_ERR(node); - item_key.objectid = btrfs_ino(dir); - item_key.type = BTRFS_DIR_INDEX_KEY; - item_key.offset = index; - - ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, - &item_key); + ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, index); if (!ret) goto end; - item = btrfs_alloc_delayed_item(0); + item = btrfs_alloc_delayed_item(0, node, BTRFS_DELAYED_DELETION_ITEM); if (!item) { ret = -ENOMEM; goto end; } - item->key = item_key; + item->index = index; - ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, item); + ret = btrfs_delayed_item_reserve_metadata(trans, item); /* * we have reserved enough space when we start a new transaction, * so reserving metadata failure is impossible. @@ -1465,7 +1589,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, } mutex_lock(&node->mutex); - ret = __btrfs_add_delayed_deletion_item(node, item); + ret = __btrfs_add_delayed_item(node, item); if (unlikely(ret)) { btrfs_err(trans->fs_info, "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", @@ -1581,9 +1705,9 @@ int btrfs_should_delete_dir_index(struct list_head *del_list, int ret = 0; list_for_each_entry(curr, del_list, readdir_list) { - if (curr->key.offset > index) + if (curr->index > index) break; - if (curr->key.offset == index) { + if (curr->index == index) { ret = 1; break; } @@ -1617,13 +1741,13 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, list_for_each_entry_safe(curr, next, ins_list, readdir_list) { list_del(&curr->readdir_list); - if (curr->key.offset < ctx->pos) { + if (curr->index < ctx->pos) { if (refcount_dec_and_test(&curr->refs)) kfree(curr); continue; } - ctx->pos = curr->key.offset; + ctx->pos = curr->index; di = (struct btrfs_dir_item *)curr->data; name = (char *)(di + 1); @@ -1833,12 +1957,17 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) mutex_lock(&delayed_node->mutex); curr_item = __btrfs_first_delayed_insertion_item(delayed_node); while (curr_item) { - btrfs_delayed_item_release_metadata(root, curr_item); prev_item = curr_item; curr_item = __btrfs_next_delayed_item(prev_item); btrfs_release_delayed_item(prev_item); } + if (delayed_node->index_item_leaves > 0) { + btrfs_delayed_item_release_leaves(delayed_node, + delayed_node->index_item_leaves); + delayed_node->index_item_leaves = 0; + } + curr_item = __btrfs_first_delayed_deletion_item(delayed_node); while (curr_item) { btrfs_delayed_item_release_metadata(root, curr_item); @@ -1918,3 +2047,113 @@ void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info) } } +void btrfs_log_get_delayed_items(struct btrfs_inode *inode, + struct list_head *ins_list, + struct list_head *del_list) +{ + struct btrfs_delayed_node *node; + struct btrfs_delayed_item *item; + + node = btrfs_get_delayed_node(inode); + if (!node) + return; + + mutex_lock(&node->mutex); + item = __btrfs_first_delayed_insertion_item(node); + while (item) { + /* + * It's possible that the item is already in a log list. This + * can happen in case two tasks are trying to log the same + * directory. For example if we have tasks A and task B: + * + * Task A collected the delayed items into a log list while + * under the inode's log_mutex (at btrfs_log_inode()), but it + * only releases the items after logging the inodes they point + * to (if they are new inodes), which happens after unlocking + * the log mutex; + * + * Task B enters btrfs_log_inode() and acquires the log_mutex + * of the same directory inode, before task B releases the + * delayed items. This can happen for example when logging some + * inode we need to trigger logging of its parent directory, so + * logging two files that have the same parent directory can + * lead to this. + * + * If this happens, just ignore delayed items already in a log + * list. All the tasks logging the directory are under a log + * transaction and whichever finishes first can not sync the log + * before the other completes and leaves the log transaction. + */ + if (!item->logged && list_empty(&item->log_list)) { + refcount_inc(&item->refs); + list_add_tail(&item->log_list, ins_list); + } + item = __btrfs_next_delayed_item(item); + } + + item = __btrfs_first_delayed_deletion_item(node); + while (item) { + /* It may be non-empty, for the same reason mentioned above. */ + if (!item->logged && list_empty(&item->log_list)) { + refcount_inc(&item->refs); + list_add_tail(&item->log_list, del_list); + } + item = __btrfs_next_delayed_item(item); + } + mutex_unlock(&node->mutex); + + /* + * We are called during inode logging, which means the inode is in use + * and can not be evicted before we finish logging the inode. So we never + * have the last reference on the delayed inode. + * Also, we don't use btrfs_release_delayed_node() because that would + * requeue the delayed inode (change its order in the list of prepared + * nodes) and we don't want to do such change because we don't create or + * delete delayed items. + */ + ASSERT(refcount_read(&node->refs) > 1); + refcount_dec(&node->refs); +} + +void btrfs_log_put_delayed_items(struct btrfs_inode *inode, + struct list_head *ins_list, + struct list_head *del_list) +{ + struct btrfs_delayed_node *node; + struct btrfs_delayed_item *item; + struct btrfs_delayed_item *next; + + node = btrfs_get_delayed_node(inode); + if (!node) + return; + + mutex_lock(&node->mutex); + + list_for_each_entry_safe(item, next, ins_list, log_list) { + item->logged = true; + list_del_init(&item->log_list); + if (refcount_dec_and_test(&item->refs)) + kfree(item); + } + + list_for_each_entry_safe(item, next, del_list, log_list) { + item->logged = true; + list_del_init(&item->log_list); + if (refcount_dec_and_test(&item->refs)) + kfree(item); + } + + mutex_unlock(&node->mutex); + + /* + * We are called during inode logging, which means the inode is in use + * and can not be evicted before we finish logging the inode. So we never + * have the last reference on the delayed inode. + * Also, we don't use btrfs_release_delayed_node() because that would + * requeue the delayed inode (change its order in the list of prepared + * nodes) and we don't want to do such change because we don't create or + * delete delayed items. + */ + ASSERT(refcount_read(&node->refs) > 1); + refcount_dec(&node->refs); +} diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index b2412160c5bc..0163ca637a96 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -16,9 +16,10 @@ #include <linux/refcount.h> #include "ctree.h" -/* types of the delayed item */ -#define BTRFS_DELAYED_INSERTION_ITEM 1 -#define BTRFS_DELAYED_DELETION_ITEM 2 +enum btrfs_delayed_item_type { + BTRFS_DELAYED_INSERTION_ITEM, + BTRFS_DELAYED_DELETION_ITEM +}; struct btrfs_delayed_root { spinlock_t lock; @@ -58,18 +59,42 @@ struct btrfs_delayed_node { u64 index_cnt; unsigned long flags; int count; + /* + * The size of the next batch of dir index items to insert (if this + * node is from a directory inode). Protected by @mutex. + */ + u32 curr_index_batch_size; + /* + * Number of leaves reserved for inserting dir index items (if this + * node belongs to a directory inode). This may be larger then the + * actual number of leaves we end up using. Protected by @mutex. + */ + u32 index_item_leaves; }; struct btrfs_delayed_item { struct rb_node rb_node; - struct btrfs_key key; + /* Offset value of the corresponding dir index key. */ + u64 index; struct list_head tree_list; /* used for batch insert/delete items */ struct list_head readdir_list; /* used for readdir items */ + /* + * Used when logging a directory. + * Insertions and deletions to this list are protected by the parent + * delayed node's mutex. + */ + struct list_head log_list; u64 bytes_reserved; struct btrfs_delayed_node *delayed_node; refcount_t refs; - int ins_or_del; - u32 data_len; + enum btrfs_delayed_item_type type:8; + /* + * Track if this delayed item was already logged. + * Protected by the mutex of the parent delayed inode. + */ + bool logged; + /* The maximum leaf size is 64K, so u16 is more than enough. */ + u16 data_len; char data[]; }; @@ -133,6 +158,14 @@ int btrfs_should_delete_dir_index(struct list_head *del_list, int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, struct list_head *ins_list); +/* Used during directory logging. */ +void btrfs_log_get_delayed_items(struct btrfs_inode *inode, + struct list_head *ins_list, + struct list_head *del_list); +void btrfs_log_put_delayed_items(struct btrfs_inode *inode, + struct list_head *ins_list, + struct list_head *del_list); + /* for init */ int __init btrfs_delayed_inode_init(void); void __cold btrfs_delayed_inode_exit(void); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 4176df149d04..36a3debe9493 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -132,7 +132,7 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) spin_lock(&delayed_rsv->lock); delayed_rsv->size += num_bytes; - delayed_rsv->full = 0; + delayed_rsv->full = false; spin_unlock(&delayed_rsv->lock); trans->delayed_ref_updates = 0; } @@ -175,7 +175,7 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, if (num_bytes) delayed_refs_rsv->reserved += num_bytes; if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) - delayed_refs_rsv->full = 1; + delayed_refs_rsv->full = true; spin_unlock(&delayed_refs_rsv->lock); if (num_bytes) @@ -930,7 +930,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID); ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action); - BUG_ON(extent_op && extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); if (!ref) return -ENOMEM; @@ -1103,8 +1102,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, return -ENOMEM; init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0, - BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data, - false); + BTRFS_UPDATE_DELAYED_HEAD, false, false); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 91a3aabad150..d6304b690ec4 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -58,7 +58,6 @@ struct btrfs_delayed_extent_op { u8 level; bool update_key; bool update_flags; - bool is_data; u64 flags_to_set; }; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 62b9651ea662..61e58066b5fd 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -165,7 +165,7 @@ no_valid_dev_replace_entry_found: */ if (btrfs_find_device(fs_info->fs_devices, &args)) { btrfs_err(fs_info, - "replace devid present without an active replace item"); +"replace without active item, run 'device scan --forget' on the target device"); ret = -EUCLEAN; } else { dev_replace->srcdev = NULL; @@ -243,6 +243,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *srcdev, struct btrfs_device **device_out) { + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; struct block_device *bdev; struct rcu_string *name; @@ -271,7 +272,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, sync_blockdev(bdev); - list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { + list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->bdev == bdev) { btrfs_err(fs_info, "target device is in the filesystem!"); @@ -302,6 +303,9 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, goto error; } rcu_assign_pointer(device->name, name); + ret = lookup_bdev(device_path, &device->devt); + if (ret) + goto error; set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); device->generation = 0; @@ -320,17 +324,17 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, device->mode = FMODE_EXCL; device->dev_stats_valid = 1; set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); - device->fs_devices = fs_info->fs_devices; + device->fs_devices = fs_devices; ret = btrfs_get_dev_zone_info(device, false); if (ret) goto error; - mutex_lock(&fs_info->fs_devices->device_list_mutex); - list_add(&device->dev_list, &fs_info->fs_devices->devices); - fs_info->fs_devices->num_devices++; - fs_info->fs_devices->open_devices++; - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_lock(&fs_devices->device_list_mutex); + list_add(&device->dev_list, &fs_devices->devices); + fs_devices->num_devices++; + fs_devices->open_devices++; + mutex_unlock(&fs_devices->device_list_mutex); *device_out = device; return 0; @@ -470,6 +474,7 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_block_group *cache; struct btrfs_trans_handle *trans; + int iter_ret = 0; int ret = 0; u64 chunk_offset; @@ -520,29 +525,8 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, key.type = BTRFS_DEV_EXTENT_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto free_path; - if (ret > 0) { - if (path->slots[0] >= - btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto free_path; - if (ret > 0) { - ret = 0; - goto free_path; - } - } else { - ret = 0; - } - } - - while (1) { + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct extent_buffer *leaf = path->nodes[0]; - int slot = path->slots[0]; - - btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.objectid != src_dev->devid) break; @@ -553,30 +537,20 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, if (found_key.offset < key.offset) break; - dev_extent = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); + dev_extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent); cache = btrfs_lookup_block_group(fs_info, chunk_offset); if (!cache) - goto skip; - - spin_lock(&cache->lock); - cache->to_copy = 1; - spin_unlock(&cache->lock); + continue; + set_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags); btrfs_put_block_group(cache); - -skip: - ret = btrfs_next_item(root, path); - if (ret != 0) { - if (ret > 0) - ret = 0; - break; - } } + if (iter_ret < 0) + ret = iter_ret; -free_path: btrfs_free_path(path); unlock: mutex_unlock(&fs_info->chunk_mutex); @@ -600,7 +574,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, return true; spin_lock(&cache->lock); - if (cache->removed) { + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) { spin_unlock(&cache->lock); return true; } @@ -610,7 +584,8 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, ASSERT(!IS_ERR(em)); map = em->map_lookup; - num_extents = cur_extent = 0; + num_extents = 0; + cur_extent = 0; for (i = 0; i < map->num_stripes; i++) { /* We have more device extent to copy */ if (srcdev != map->stripes[i].dev) @@ -632,9 +607,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, } /* Last stripe on this device */ - spin_lock(&cache->lock); - cache->to_copy = 0; - spin_unlock(&cache->lock); + clear_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags); return true; } @@ -730,7 +703,12 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); - /* Commit dev_replace state and reserve 1 item for it. */ + /* + * Commit dev_replace state and reserve 1 item for it. + * This is crucial to ensure we won't miss copying extents for new block + * groups that are allocated after we started the device replace, and + * must be done after setting up the device replace state. + */ trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -872,6 +850,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *tgt_device; struct btrfs_device *src_device; struct btrfs_root *root = fs_info->tree_root; @@ -921,12 +900,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, WARN_ON(ret); /* Prevent write_all_supers() during the finishing procedure */ - mutex_lock(&fs_info->fs_devices->device_list_mutex); + mutex_lock(&fs_devices->device_list_mutex); /* Prevent new chunks being allocated on the source device */ mutex_lock(&fs_info->chunk_mutex); if (!list_empty(&src_device->post_commit_list)) { - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_devices->device_list_mutex); mutex_unlock(&fs_info->chunk_mutex); } else { break; @@ -963,7 +942,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, error: up_write(&dev_replace->rwsem); mutex_unlock(&fs_info->chunk_mutex); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_devices->device_list_mutex); btrfs_rm_dev_replace_blocked(fs_info); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(tgt_device); @@ -992,8 +971,8 @@ error: btrfs_assign_next_active_device(src_device, tgt_device); - list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); - fs_info->fs_devices->rw_devices++; + list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list); + fs_devices->rw_devices++; up_write(&dev_replace->rwsem); btrfs_rm_dev_replace_blocked(fs_info); @@ -1016,7 +995,7 @@ error: * belong to this filesystem. */ mutex_unlock(&fs_info->chunk_mutex); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_devices->device_list_mutex); /* replace the sysfs entry */ btrfs_sysfs_remove_device(src_device); @@ -1145,8 +1124,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) up_write(&dev_replace->rwsem); /* Scrub for replace must not be running in suspended state */ - ret = btrfs_scrub_cancel(fs_info); - ASSERT(ret != -ENOTCONN); + btrfs_scrub_cancel(fs_info); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { @@ -1305,11 +1283,6 @@ int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) return 1; } -void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) -{ - percpu_counter_inc(&fs_info->dev_replace.bio_counter); -} - void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount) { percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount); diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 3911049a5f23..6084b313056a 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -7,6 +7,10 @@ #define BTRFS_DEV_REPLACE_H struct btrfs_ioctl_dev_replace_args; +struct btrfs_fs_info; +struct btrfs_trans_handle; +struct btrfs_dev_replace; +struct btrfs_block_group; int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); int btrfs_run_dev_replace(struct btrfs_trans_handle *trans); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 3b532bab0755..72fb2c518a2b 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -325,36 +325,15 @@ btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, const char *name, int name_len) { - struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key key; - u32 nritems; int ret; key.objectid = dirid; key.type = BTRFS_DIR_INDEX_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - return ERR_PTR(ret); - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - - while (1) { - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - return ERR_PTR(ret); - if (ret > 0) - break; - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - continue; - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_for_each_slot(root, &key, &key, path, ret) { if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) break; @@ -362,10 +341,12 @@ btrfs_search_dir_index_item(struct btrfs_root *root, name, name_len); if (di) return di; - - path->slots[0]++; } - return NULL; + /* Adjust return code if the key was not found in the next leaf. */ + if (ret > 0) + ret = 0; + + return ERR_PTR(ret); } struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 87a5addbedf6..d99bf7c64611 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -51,7 +51,6 @@ BTRFS_SUPER_FLAG_METADUMP |\ BTRFS_SUPER_FLAG_METADUMP_V2) -static void end_workqueue_fn(struct btrfs_work *work); static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); @@ -64,40 +63,6 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info); static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info); -/* - * btrfs_end_io_wq structs are used to do processing in task context when an IO - * is complete. This is used during reads to verify checksums, and it is used - * by writes to insert metadata for new file extents after IO is complete. - */ -struct btrfs_end_io_wq { - struct bio *bio; - bio_end_io_t *end_io; - void *private; - struct btrfs_fs_info *info; - blk_status_t status; - enum btrfs_wq_endio_type metadata; - struct btrfs_work work; -}; - -static struct kmem_cache *btrfs_end_io_wq_cache; - -int __init btrfs_end_io_wq_init(void) -{ - btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq", - sizeof(struct btrfs_end_io_wq), - 0, - SLAB_MEM_SPREAD, - NULL); - if (!btrfs_end_io_wq_cache) - return -ENOMEM; - return 0; -} - -void __cold btrfs_end_io_wq_exit(void) -{ - kmem_cache_destroy(btrfs_end_io_wq_cache); -} - static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) { if (fs_info->csum_shash) @@ -122,88 +87,6 @@ struct async_submit_bio { }; /* - * Lockdep class keys for extent_buffer->lock's in this root. For a given - * eb, the lockdep key is determined by the btrfs_root it belongs to and - * the level the eb occupies in the tree. - * - * Different roots are used for different purposes and may nest inside each - * other and they require separate keysets. As lockdep keys should be - * static, assign keysets according to the purpose of the root as indicated - * by btrfs_root->root_key.objectid. This ensures that all special purpose - * roots have separate keysets. - * - * Lock-nesting across peer nodes is always done with the immediate parent - * node locked thus preventing deadlock. As lockdep doesn't know this, use - * subclass to avoid triggering lockdep warning in such cases. - * - * The key is set by the readpage_end_io_hook after the buffer has passed - * csum validation but before the pages are unlocked. It is also set by - * btrfs_init_new_buffer on freshly allocated blocks. - * - * We also add a check to make sure the highest level of the tree is the - * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code - * needs update as well. - */ -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# if BTRFS_MAX_LEVEL != 8 -# error -# endif - -#define DEFINE_LEVEL(stem, level) \ - .names[level] = "btrfs-" stem "-0" #level, - -#define DEFINE_NAME(stem) \ - DEFINE_LEVEL(stem, 0) \ - DEFINE_LEVEL(stem, 1) \ - DEFINE_LEVEL(stem, 2) \ - DEFINE_LEVEL(stem, 3) \ - DEFINE_LEVEL(stem, 4) \ - DEFINE_LEVEL(stem, 5) \ - DEFINE_LEVEL(stem, 6) \ - DEFINE_LEVEL(stem, 7) - -static struct btrfs_lockdep_keyset { - u64 id; /* root objectid */ - /* Longest entry: btrfs-free-space-00 */ - char names[BTRFS_MAX_LEVEL][20]; - struct lock_class_key keys[BTRFS_MAX_LEVEL]; -} btrfs_lockdep_keysets[] = { - { .id = BTRFS_ROOT_TREE_OBJECTID, DEFINE_NAME("root") }, - { .id = BTRFS_EXTENT_TREE_OBJECTID, DEFINE_NAME("extent") }, - { .id = BTRFS_CHUNK_TREE_OBJECTID, DEFINE_NAME("chunk") }, - { .id = BTRFS_DEV_TREE_OBJECTID, DEFINE_NAME("dev") }, - { .id = BTRFS_CSUM_TREE_OBJECTID, DEFINE_NAME("csum") }, - { .id = BTRFS_QUOTA_TREE_OBJECTID, DEFINE_NAME("quota") }, - { .id = BTRFS_TREE_LOG_OBJECTID, DEFINE_NAME("log") }, - { .id = BTRFS_TREE_RELOC_OBJECTID, DEFINE_NAME("treloc") }, - { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, DEFINE_NAME("dreloc") }, - { .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") }, - { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") }, - { .id = 0, DEFINE_NAME("tree") }, -}; - -#undef DEFINE_LEVEL -#undef DEFINE_NAME - -void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, - int level) -{ - struct btrfs_lockdep_keyset *ks; - - BUG_ON(level >= ARRAY_SIZE(ks->keys)); - - /* find the matching keyset, id 0 is the default entry */ - for (ks = btrfs_lockdep_keysets; ks->id; ks++) - if (ks->id == objectid) - break; - - lockdep_set_class_and_name(&eb->lock, - &ks->keys[level], ks->names[level]); -} - -#endif - -/* * Compute the csum of a btree block and store the result to provided buffer. */ static void csum_tree_block(struct extent_buffer *buf, u8 *result) @@ -248,22 +131,21 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, if (atomic) return -EAGAIN; - lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, - &cached_state); + lock_extent(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); if (extent_buffer_uptodate(eb) && btrfs_header_generation(eb) == parent_transid) { ret = 0; goto out; } btrfs_err_rl(eb->fs_info, - "parent transid verify failed on %llu wanted %llu found %llu", - eb->start, +"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu", + eb->start, eb->read_mirror, parent_transid, btrfs_header_generation(eb)); ret = 1; clear_extent_buffer_uptodate(eb); out: - unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, - &cached_state); + unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, + &cached_state); return ret; } @@ -284,11 +166,9 @@ static bool btrfs_supported_super_csum(u16 csum_type) * Return 0 if the superblock checksum type matches the checksum value of that * algorithm. Pass the raw disk superblock data. */ -static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, - char *raw_disk_sb) +int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, + const struct btrfs_super_block *disk_sb) { - struct btrfs_super_block *disk_sb = - (struct btrfs_super_block *)raw_disk_sb; char result[BTRFS_CSUM_SIZE]; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); @@ -299,7 +179,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is * filled with zeros and is included in the checksum. */ - crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE, + crypto_shash_digest(shash, (const u8 *)disk_sb + BTRFS_CSUM_SIZE, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result); if (memcmp(disk_sb->csum, result, fs_info->csum_size)) @@ -374,9 +254,9 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, * @level: expected level, mandatory check * @first_key: expected key of first slot, skip check if NULL */ -static int btree_read_extent_buffer_pages(struct extent_buffer *eb, - u64 parent_transid, int level, - struct btrfs_key *first_key) +int btrfs_read_extent_buffer(struct extent_buffer *eb, + u64 parent_transid, int level, + struct btrfs_key *first_key) { struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *io_tree; @@ -441,17 +321,31 @@ static int csum_one_extent_buffer(struct extent_buffer *eb) else ret = btrfs_check_leaf_full(eb); - if (ret < 0) { - btrfs_print_tree(eb, 0); + if (ret < 0) + goto error; + + /* + * Also check the generation, the eb reached here must be newer than + * last committed. Or something seriously wrong happened. + */ + if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) { + ret = -EUCLEAN; btrfs_err(fs_info, - "block=%llu write time tree block corruption detected", - eb->start); - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); - return ret; + "block=%llu bad generation, have %llu expect > %llu", + eb->start, btrfs_header_generation(eb), + fs_info->last_trans_committed); + goto error; } write_extent_buffer(eb, result, 0, fs_info->csum_size); return 0; + +error: + btrfs_print_tree(eb, 0); + btrfs_err(fs_info, "block=%llu write time tree block corruption detected", + eb->start); + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + return ret; } /* Checksum all dirty extent buffers in one bio_vec */ @@ -505,7 +399,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec u64 found_start; struct extent_buffer *eb; - if (fs_info->sectorsize < PAGE_SIZE) + if (fs_info->nodesize < PAGE_SIZE) return csum_dirty_subpage_buffers(fs_info, bvec); eb = (struct extent_buffer *)page->private; @@ -573,21 +467,23 @@ static int validate_extent_buffer(struct extent_buffer *eb) found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { - btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu", - eb->start, found_start); + btrfs_err_rl(fs_info, + "bad tree block start, mirror %u want %llu have %llu", + eb->read_mirror, eb->start, found_start); ret = -EIO; goto out; } if (check_tree_block_fsid(eb)) { - btrfs_err_rl(fs_info, "bad fsid on block %llu", - eb->start); + btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u", + eb->start, eb->read_mirror); ret = -EIO; goto out; } found_level = btrfs_header_level(eb); if (found_level >= BTRFS_MAX_LEVEL) { - btrfs_err(fs_info, "bad tree block level %d on %llu", - (int)btrfs_header_level(eb), eb->start); + btrfs_err(fs_info, + "bad tree block level, mirror %u level %d on logical %llu", + eb->read_mirror, btrfs_header_level(eb), eb->start); ret = -EIO; goto out; } @@ -598,8 +494,8 @@ static int validate_extent_buffer(struct extent_buffer *eb) if (memcmp(result, header_csum, csum_size) != 0) { btrfs_warn_rl(fs_info, - "checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d", - eb->start, +"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d", + eb->start, eb->read_mirror, CSUM_FMT_VALUE(csum_size, header_csum), CSUM_FMT_VALUE(csum_size, result), btrfs_header_level(eb)); @@ -624,8 +520,8 @@ static int validate_extent_buffer(struct extent_buffer *eb) set_extent_buffer_uptodate(eb); else btrfs_err(fs_info, - "block=%llu read time tree block corruption detected", - eb->start); + "read time tree block corruption detected on logical %llu mirror %u", + eb->start, eb->read_mirror); out: return ret; } @@ -690,7 +586,7 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, ASSERT(page->private); - if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) return validate_subpage_buffer(page, start, end, mirror); eb = (struct extent_buffer *)page->private; @@ -726,58 +622,6 @@ err: return ret; } -static void end_workqueue_bio(struct bio *bio) -{ - struct btrfs_end_io_wq *end_io_wq = bio->bi_private; - struct btrfs_fs_info *fs_info; - struct btrfs_workqueue *wq; - - fs_info = end_io_wq->info; - end_io_wq->status = bio->bi_status; - - if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) - wq = fs_info->endio_meta_write_workers; - else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) - wq = fs_info->endio_freespace_worker; - else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - wq = fs_info->endio_raid56_workers; - else - wq = fs_info->endio_write_workers; - } else { - if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - wq = fs_info->endio_raid56_workers; - else if (end_io_wq->metadata) - wq = fs_info->endio_meta_workers; - else - wq = fs_info->endio_workers; - } - - btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL); - btrfs_queue_work(wq, &end_io_wq->work); -} - -blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, - enum btrfs_wq_endio_type metadata) -{ - struct btrfs_end_io_wq *end_io_wq; - - end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS); - if (!end_io_wq) - return BLK_STS_RESOURCE; - - end_io_wq->private = bio->bi_private; - end_io_wq->end_io = bio->bi_end_io; - end_io_wq->info = info; - end_io_wq->status = 0; - end_io_wq->bio = bio; - end_io_wq->metadata = metadata; - - bio->bi_private = end_io_wq; - bio->bi_end_io = end_workqueue_bio; - return 0; -} - static void run_one_async_start(struct btrfs_work *work) { struct async_submit_bio *async; @@ -800,17 +644,14 @@ static void run_one_async_start(struct btrfs_work *work) */ static void run_one_async_done(struct btrfs_work *work) { - struct async_submit_bio *async; - struct inode *inode; - blk_status_t ret; - - async = container_of(work, struct async_submit_bio, work); - inode = async->inode; + struct async_submit_bio *async = + container_of(work, struct async_submit_bio, work); + struct inode *inode = async->inode; + struct btrfs_bio *bbio = btrfs_bio(async->bio); /* If an error occurred we just want to clean up the bio and move on */ if (async->status) { - async->bio->bi_status = async->status; - bio_endio(async->bio); + btrfs_bio_end_io(bbio, async->status); return; } @@ -820,11 +661,7 @@ static void run_one_async_done(struct btrfs_work *work) * This changes nothing when cgroups aren't in use. */ async->bio->bi_opf |= REQ_CGROUP_PUNT; - ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num); - if (ret) { - async->bio->bi_status = ret; - bio_endio(async->bio); - } + btrfs_submit_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num); } static void run_one_async_free(struct btrfs_work *work) @@ -835,17 +672,23 @@ static void run_one_async_free(struct btrfs_work *work) kfree(async); } -blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 dio_file_offset, - extent_submit_bio_start_t *submit_bio_start) +/* + * Submit bio to an async queue. + * + * Retrun: + * - true if the work has been succesfuly submitted + * - false in case of error + */ +bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, + u64 dio_file_offset, + extent_submit_bio_start_t *submit_bio_start) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct async_submit_bio *async; async = kmalloc(sizeof(*async), GFP_NOFS); if (!async) - return BLK_STS_RESOURCE; + return false; async->inode = inode; async->bio = bio; @@ -860,10 +703,10 @@ blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, async->status = 0; if (op_is_sync(bio->bi_opf)) - btrfs_set_work_high_priority(&async->work); - - btrfs_queue_work(fs_info->workers, &async->work); - return 0; + btrfs_queue_work(fs_info->hipri_workers, &async->work); + else + btrfs_queue_work(fs_info->workers, &async->work); + return true; } static blk_status_t btree_csum_one_bio(struct bio *bio) @@ -889,7 +732,7 @@ static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, { /* * when we're called for a write, we're already in the async - * submission context. Just jump into btrfs_map_bio + * submission context. Just jump into btrfs_submit_bio. */ return btree_csum_one_bio(bio); } @@ -906,69 +749,59 @@ static bool should_async_write(struct btrfs_fs_info *fs_info, return true; } -blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags) +void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_bio *bbio = btrfs_bio(bio); blk_status_t ret; + bio->bi_opf |= REQ_META; + if (btrfs_op(bio) != BTRFS_MAP_WRITE) { - /* - * called for a read, do the setup so that checksum validation - * can happen in the async kernel threads - */ - ret = btrfs_bio_wq_end_io(fs_info, bio, - BTRFS_WQ_ENDIO_METADATA); - if (ret) - goto out_w_error; - ret = btrfs_map_bio(fs_info, bio, mirror_num); - } else if (!should_async_write(fs_info, BTRFS_I(inode))) { - ret = btree_csum_one_bio(bio); - if (ret) - goto out_w_error; - ret = btrfs_map_bio(fs_info, bio, mirror_num); - } else { - /* - * kthread helpers are used to submit writes so that - * checksumming can happen in parallel across all CPUs - */ - ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0, - 0, btree_submit_bio_start); + btrfs_submit_bio(fs_info, bio, mirror_num); + return; } - if (ret) - goto out_w_error; - return 0; + /* + * Kthread helpers are used to submit writes so that checksumming can + * happen in parallel across all CPUs. + */ + if (should_async_write(fs_info, BTRFS_I(inode)) && + btrfs_wq_submit_bio(inode, bio, mirror_num, 0, btree_submit_bio_start)) + return; -out_w_error: - bio->bi_status = ret; - bio_endio(bio); - return ret; + ret = btree_csum_one_bio(bio); + if (ret) { + btrfs_bio_end_io(bbio, ret); + return; + } + + btrfs_submit_bio(fs_info, bio, mirror_num); } #ifdef CONFIG_MIGRATION -static int btree_migratepage(struct address_space *mapping, - struct page *newpage, struct page *page, - enum migrate_mode mode) +static int btree_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) { /* * we can't safely write a btree page from here, * we haven't done the locking hook */ - if (PageDirty(page)) + if (folio_test_dirty(src)) return -EAGAIN; /* * Buffers may be managed in a filesystem specific way. * We must have no buffers or drop them. */ - if (page_has_private(page) && - !try_to_release_page(page, GFP_KERNEL)) + if (folio_get_private(src) && + !filemap_release_folio(src, GFP_KERNEL)) return -EAGAIN; - return migrate_page(mapping, newpage, page, mode); + return migrate_folio(mapping, dst, src, mode); } +#else +#define btree_migrate_folio NULL #endif - static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -991,49 +824,48 @@ static int btree_writepages(struct address_space *mapping, return btree_write_cache_pages(mapping, wbc); } -static int btree_releasepage(struct page *page, gfp_t gfp_flags) +static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags) { - if (PageWriteback(page) || PageDirty(page)) - return 0; + if (folio_test_writeback(folio) || folio_test_dirty(folio)) + return false; - return try_release_extent_buffer(page); + return try_release_extent_buffer(&folio->page); } -static void btree_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) +static void btree_invalidate_folio(struct folio *folio, size_t offset, + size_t length) { struct extent_io_tree *tree; - tree = &BTRFS_I(page->mapping->host)->io_tree; - extent_invalidatepage(tree, page, offset); - btree_releasepage(page, GFP_NOFS); - if (PagePrivate(page)) { - btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info, - "page private not zero on page %llu", - (unsigned long long)page_offset(page)); - detach_page_private(page); + tree = &BTRFS_I(folio->mapping->host)->io_tree; + extent_invalidate_folio(tree, folio, offset); + btree_release_folio(folio, GFP_NOFS); + if (folio_get_private(folio)) { + btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info, + "folio private not zero on folio %llu", + (unsigned long long)folio_pos(folio)); + folio_detach_private(folio); } } -static int btree_set_page_dirty(struct page *page) -{ #ifdef DEBUG - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); +static bool btree_dirty_folio(struct address_space *mapping, + struct folio *folio) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); struct btrfs_subpage *subpage; struct extent_buffer *eb; int cur_bit = 0; - u64 page_start = page_offset(page); + u64 page_start = folio_pos(folio); if (fs_info->sectorsize == PAGE_SIZE) { - BUG_ON(!PagePrivate(page)); - eb = (struct extent_buffer *)page->private; + eb = folio_get_private(folio); BUG_ON(!eb); BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); BUG_ON(!atomic_read(&eb->refs)); btrfs_assert_tree_write_locked(eb); - return __set_page_dirty_nobuffers(page); + return filemap_dirty_folio(mapping, folio); } - ASSERT(PagePrivate(page) && page->private); - subpage = (struct btrfs_subpage *)page->private; + subpage = folio_get_private(folio); ASSERT(subpage->dirty_bitmap); while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) { @@ -1059,18 +891,18 @@ static int btree_set_page_dirty(struct page *page) cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits); } -#endif - return __set_page_dirty_nobuffers(page); + return filemap_dirty_folio(mapping, folio); } +#else +#define btree_dirty_folio filemap_dirty_folio +#endif static const struct address_space_operations btree_aops = { .writepages = btree_writepages, - .releasepage = btree_releasepage, - .invalidatepage = btree_invalidatepage, -#ifdef CONFIG_MIGRATION - .migratepage = btree_migratepage, -#endif - .set_page_dirty = btree_set_page_dirty, + .release_folio = btree_release_folio, + .invalidate_folio = btree_invalidate_folio, + .migrate_folio = btree_migrate_folio, + .dirty_folio = btree_dirty_folio, }; struct extent_buffer *btrfs_find_create_tree_block( @@ -1103,12 +935,15 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, if (IS_ERR(buf)) return buf; - ret = btree_read_extent_buffer_pages(buf, parent_transid, - level, first_key); + ret = btrfs_read_extent_buffer(buf, parent_transid, level, first_key); if (ret) { free_extent_buffer_stale(buf); return ERR_PTR(ret); } + if (btrfs_check_eb_owner(buf, owner_root)) { + free_extent_buffer_stale(buf); + return ERR_PTR(-EUCLEAN); + } return buf; } @@ -1289,12 +1124,33 @@ struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info, return root; } +static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group *block_group; + u64 ret; + + if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return 0; + + if (bytenr) + block_group = btrfs_lookup_block_group(fs_info, bytenr); + else + block_group = btrfs_lookup_first_block_group(fs_info, bytenr); + ASSERT(block_group); + if (!block_group) + return 0; + ret = block_group->global_root_id; + btrfs_put_block_group(block_group); + + return ret; +} + struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr) { struct btrfs_key key = { .objectid = BTRFS_CSUM_TREE_OBJECTID, .type = BTRFS_ROOT_ITEM_KEY, - .offset = 0, + .offset = btrfs_global_root_id(fs_info, bytenr), }; return btrfs_global_root(fs_info, &key); @@ -1305,7 +1161,7 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr) struct btrfs_key key = { .objectid = BTRFS_EXTENT_TREE_OBJECTID, .type = BTRFS_ROOT_ITEM_KEY, - .offset = 0, + .offset = btrfs_global_root_id(fs_info, bytenr), }; return btrfs_global_root(fs_info, &key); @@ -1522,10 +1378,28 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, ret = PTR_ERR(root->node); root->node = NULL; goto fail; - } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) { + } + if (!btrfs_buffer_uptodate(root->node, generation, 0)) { ret = -EIO; goto fail; } + + /* + * For real fs, and not log/reloc trees, root owner must + * match its root node owner + */ + if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) && + root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && + root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && + root->root_key.objectid != btrfs_header_owner(root->node)) { + btrfs_crit(fs_info, +"root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu", + root->root_key.objectid, root->node->start, + btrfs_header_owner(root->node), + root->root_key.objectid); + ret = -EUCLEAN; + goto fail; + } root->commit_root = btrfs_root_node(root); return root; fail: @@ -1645,6 +1519,9 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info, if (objectid == BTRFS_UUID_TREE_OBJECTID) return btrfs_grab_root(fs_info->uuid_root) ? fs_info->uuid_root : ERR_PTR(-ENOENT); + if (objectid == BTRFS_BLOCK_GROUP_TREE_OBJECTID) + return btrfs_grab_root(fs_info->block_group_root) ? + fs_info->block_group_root : ERR_PTR(-ENOENT); if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) { struct btrfs_root *root = btrfs_global_root(fs_info, &key); @@ -1727,6 +1604,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_put_root(fs_info->uuid_root); btrfs_put_root(fs_info->fs_root); btrfs_put_root(fs_info->data_reloc_root); + btrfs_put_root(fs_info->block_group_root); btrfs_check_leaked_roots(fs_info); btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); @@ -1812,16 +1690,17 @@ again: ret = btrfs_insert_fs_root(fs_info, root); if (ret) { - btrfs_put_root(root); - if (ret == -EEXIST) + if (ret == -EEXIST) { + btrfs_put_root(root); goto again; + } goto fail; } return root; fail: /* * If our caller provided us an anonymous device, then it's his - * responsability to free it in case we fail. So we have to set our + * responsibility to free it in case we fail. So we have to set our * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root() * and once again by our caller. */ @@ -1904,29 +1783,9 @@ struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info, return root; } -/* - * called by the kthread helper functions to finally call the bio end_io - * functions. This is where read checksum verification actually happens - */ -static void end_workqueue_fn(struct btrfs_work *work) -{ - struct bio *bio; - struct btrfs_end_io_wq *end_io_wq; - - end_io_wq = container_of(work, struct btrfs_end_io_wq, work); - bio = end_io_wq->bio; - - bio->bi_status = end_io_wq->status; - bio->bi_private = end_io_wq->private; - bio->bi_end_io = end_io_wq->end_io; - bio_endio(bio); - kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); -} - static int cleaner_kthread(void *arg) { - struct btrfs_root *root = arg; - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = arg; int again; while (1) { @@ -1959,7 +1818,7 @@ static int cleaner_kthread(void *arg) btrfs_run_delayed_iputs(fs_info); - again = btrfs_clean_one_deleted_snapshot(root); + again = btrfs_clean_one_deleted_snapshot(fs_info); mutex_unlock(&fs_info->cleaner_mutex); /* @@ -2095,8 +1954,6 @@ static void backup_super_roots(struct btrfs_fs_info *info) { const int next_backup = info->backup_root_index; struct btrfs_root_backup *root_backup; - struct btrfs_root *extent_root = btrfs_extent_root(info, 0); - struct btrfs_root *csum_root = btrfs_csum_root(info, 0); root_backup = info->super_for_commit->super_roots + next_backup; @@ -2121,11 +1978,23 @@ static void backup_super_roots(struct btrfs_fs_info *info) btrfs_set_backup_chunk_root_level(root_backup, btrfs_header_level(info->chunk_root->node)); - btrfs_set_backup_extent_root(root_backup, extent_root->node->start); - btrfs_set_backup_extent_root_gen(root_backup, - btrfs_header_generation(extent_root->node)); - btrfs_set_backup_extent_root_level(root_backup, - btrfs_header_level(extent_root->node)); + if (!btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE)) { + struct btrfs_root *extent_root = btrfs_extent_root(info, 0); + struct btrfs_root *csum_root = btrfs_csum_root(info, 0); + + btrfs_set_backup_extent_root(root_backup, + extent_root->node->start); + btrfs_set_backup_extent_root_gen(root_backup, + btrfs_header_generation(extent_root->node)); + btrfs_set_backup_extent_root_level(root_backup, + btrfs_header_level(extent_root->node)); + + btrfs_set_backup_csum_root(root_backup, csum_root->node->start); + btrfs_set_backup_csum_root_gen(root_backup, + btrfs_header_generation(csum_root->node)); + btrfs_set_backup_csum_root_level(root_backup, + btrfs_header_level(csum_root->node)); + } /* * we might commit during log recovery, which happens before we set @@ -2146,12 +2015,6 @@ static void backup_super_roots(struct btrfs_fs_info *info) btrfs_set_backup_dev_root_level(root_backup, btrfs_header_level(info->dev_root->node)); - btrfs_set_backup_csum_root(root_backup, csum_root->node->start); - btrfs_set_backup_csum_root_gen(root_backup, - btrfs_header_generation(csum_root->node)); - btrfs_set_backup_csum_root_level(root_backup, - btrfs_header_level(csum_root->node)); - btrfs_set_backup_total_bytes(root_backup, btrfs_super_total_bytes(info->super_copy)); btrfs_set_backup_bytes_used(root_backup, @@ -2217,10 +2080,16 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) { btrfs_destroy_workqueue(fs_info->fixup_workers); btrfs_destroy_workqueue(fs_info->delalloc_workers); + btrfs_destroy_workqueue(fs_info->hipri_workers); btrfs_destroy_workqueue(fs_info->workers); - btrfs_destroy_workqueue(fs_info->endio_workers); - btrfs_destroy_workqueue(fs_info->endio_raid56_workers); - btrfs_destroy_workqueue(fs_info->rmw_workers); + if (fs_info->endio_workers) + destroy_workqueue(fs_info->endio_workers); + if (fs_info->endio_raid56_workers) + destroy_workqueue(fs_info->endio_raid56_workers); + if (fs_info->rmw_workers) + destroy_workqueue(fs_info->rmw_workers); + if (fs_info->compressed_write_workers) + destroy_workqueue(fs_info->compressed_write_workers); btrfs_destroy_workqueue(fs_info->endio_write_workers); btrfs_destroy_workqueue(fs_info->endio_freespace_worker); btrfs_destroy_workqueue(fs_info->delayed_workers); @@ -2234,8 +2103,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) * the queues used for metadata I/O, since tasks from those other work * queues can do metadata I/O operations. */ - btrfs_destroy_workqueue(fs_info->endio_meta_workers); - btrfs_destroy_workqueue(fs_info->endio_meta_write_workers); + if (fs_info->endio_meta_workers) + destroy_workqueue(fs_info->endio_meta_workers); } static void free_root_extent_buffers(struct btrfs_root *root) @@ -2269,6 +2138,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) free_root_extent_buffers(info->uuid_root); free_root_extent_buffers(info->fs_root); free_root_extent_buffers(info->data_reloc_root); + free_root_extent_buffers(info->block_group_root); if (free_chunk_root) free_root_extent_buffers(info->chunk_root); } @@ -2346,6 +2216,8 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info) static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) { struct inode *inode = fs_info->btree_inode; + unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID, + fs_info->tree_root); inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; set_nlink(inode, 1); @@ -2359,14 +2231,15 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, - IO_TREE_BTREE_INODE_IO, inode); - BTRFS_I(inode)->io_tree.track_uptodate = false; + IO_TREE_BTREE_INODE_IO, NULL); extent_map_tree_init(&BTRFS_I(inode)->extent_tree); BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); - memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key)); + BTRFS_I(inode)->location.objectid = BTRFS_BTREE_INODE_OBJECTID; + BTRFS_I(inode)->location.type = 0; + BTRFS_I(inode)->location.offset = 0; set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); - btrfs_insert_inode_hash(inode); + __insert_inode_hash(inode, hash); } static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info) @@ -2385,6 +2258,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) fs_info->qgroup_seq = 1; fs_info->qgroup_ulist = NULL; fs_info->qgroup_rescan_running = false; + fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; mutex_init(&fs_info->qgroup_rescan_lock); } @@ -2394,7 +2268,9 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; fs_info->workers = - btrfs_alloc_workqueue(fs_info, "worker", + btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16); + fs_info->hipri_workers = + btrfs_alloc_workqueue(fs_info, "worker-high", flags | WQ_HIGHPRI, max_active, 16); fs_info->delalloc_workers = @@ -2411,26 +2287,18 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) fs_info->fixup_workers = btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0); - /* - * endios are largely parallel and should have a very - * low idle thresh - */ fs_info->endio_workers = - btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4); + alloc_workqueue("btrfs-endio", flags, max_active); fs_info->endio_meta_workers = - btrfs_alloc_workqueue(fs_info, "endio-meta", flags, - max_active, 4); - fs_info->endio_meta_write_workers = - btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags, - max_active, 2); + alloc_workqueue("btrfs-endio-meta", flags, max_active); fs_info->endio_raid56_workers = - btrfs_alloc_workqueue(fs_info, "endio-raid56", flags, - max_active, 4); - fs_info->rmw_workers = - btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2); + alloc_workqueue("btrfs-endio-raid56", flags, max_active); + fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active); fs_info->endio_write_workers = btrfs_alloc_workqueue(fs_info, "endio-write", flags, max_active, 2); + fs_info->compressed_write_workers = + alloc_workqueue("btrfs-compressed-write", flags, max_active); fs_info->endio_freespace_worker = btrfs_alloc_workqueue(fs_info, "freespace-write", flags, max_active, 0); @@ -2442,10 +2310,10 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) fs_info->discard_ctl.discard_workers = alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1); - if (!(fs_info->workers && fs_info->delalloc_workers && - fs_info->flush_workers && + if (!(fs_info->workers && fs_info->hipri_workers && + fs_info->delalloc_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && - fs_info->endio_meta_write_workers && + fs_info->compressed_write_workers && fs_info->endio_write_workers && fs_info->endio_raid56_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->fixup_workers && @@ -2472,6 +2340,9 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) fs_info->csum_shash = csum_shash; + btrfs_info(fs_info, "using %s (%s) checksum algorithm", + btrfs_super_csum_name(csum_type), + crypto_shash_driver_name(csum_shash)); return 0; } @@ -2504,11 +2375,13 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, log_tree_root->node = NULL; btrfs_put_root(log_tree_root); return ret; - } else if (!extent_buffer_uptodate(log_tree_root->node)) { + } + if (!extent_buffer_uptodate(log_tree_root->node)) { btrfs_err(fs_info, "failed to read log tree"); btrfs_put_root(log_tree_root); return -EIO; } + /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); if (ret) { @@ -2533,6 +2406,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, { struct btrfs_fs_info *fs_info = tree_root->fs_info; struct btrfs_root *root; + u64 max_global_id = 0; int ret; struct btrfs_key key = { .objectid = objectid, @@ -2568,6 +2442,13 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, break; btrfs_release_path(path); + /* + * Just worry about this for extent tree, it'll be the same for + * everybody. + */ + if (objectid == BTRFS_EXTENT_TREE_OBJECTID) + max_global_id = max(max_global_id, key.offset); + found = true; root = read_tree_root_path(tree_root, path, &key); if (IS_ERR(root)) { @@ -2585,6 +2466,9 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, } btrfs_release_path(path); + if (objectid == BTRFS_EXTENT_TREE_OBJECTID) + fs_info->nr_global_roots = max_global_id + 1; + if (!found || ret) { if (objectid == BTRFS_CSUM_TREE_OBJECTID) set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); @@ -2638,10 +2522,24 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) if (ret) return ret; - location.objectid = BTRFS_DEV_TREE_OBJECTID; location.type = BTRFS_ROOT_ITEM_KEY; location.offset = 0; + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) { + location.objectid = BTRFS_BLOCK_GROUP_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) { + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->block_group_root = root; + } + } + + location.objectid = BTRFS_DEV_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { @@ -2653,7 +2551,9 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) fs_info->dev_root = root; } /* Initialize fs_info for all devices in any case */ - btrfs_init_devices_late(fs_info); + ret = btrfs_init_devices_late(fs_info); + if (ret) + goto out; /* * This tree can share blocks with some other fs tree during relocation @@ -2709,8 +2609,8 @@ out: * 1, 2 2nd and 3rd backup copy * -1 skip bytenr check */ -static int validate_super(struct btrfs_fs_info *fs_info, - struct btrfs_super_block *sb, int mirror_num) +int btrfs_validate_super(struct btrfs_fs_info *fs_info, + struct btrfs_super_block *sb, int mirror_num) { u64 nodesize = btrfs_super_nodesize(sb); u64 sectorsize = btrfs_super_sectorsize(sb); @@ -2752,12 +2652,14 @@ static int validate_super(struct btrfs_fs_info *fs_info, } /* - * For 4K page size, we only support 4K sector size. - * For 64K page size, we support 64K and 4K sector sizes. + * We only support at most two sectorsizes: 4K and PAGE_SIZE. + * + * We can support 16K sectorsize with 64K page size without problem, + * but such sectorsize/pagesize combination doesn't make much sense. + * 4K will be our future standard, PAGE_SIZE is supported from the very + * beginning. */ - if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) || - (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K && - sectorsize != SZ_64K))) { + if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) { btrfs_err(fs_info, "sectorsize %llu not yet supported for page size %lu", sectorsize, PAGE_SIZE); @@ -2810,6 +2712,18 @@ static int validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } + /* + * Artificial requirement for block-group-tree to force newer features + * (free-space-tree, no-holes) so the test matrix is smaller. + */ + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) && + (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) || + !btrfs_fs_incompat(fs_info, NO_HOLES))) { + btrfs_err(fs_info, + "block-group-tree feature requires fres-space-tree and no-holes"); + ret = -EINVAL; + } + if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, @@ -2892,7 +2806,7 @@ static int validate_super(struct btrfs_fs_info *fs_info, */ static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info) { - return validate_super(fs_info, fs_info->super_copy, 0); + return btrfs_validate_super(fs_info, fs_info->super_copy, 0); } /* @@ -2906,7 +2820,7 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info, { int ret; - ret = validate_super(fs_info, sb, -1); + ret = btrfs_validate_super(fs_info, sb, -1); if (ret < 0) goto out; if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) { @@ -2930,6 +2844,46 @@ out: return ret; } +static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level) +{ + int ret = 0; + + root->node = read_tree_block(root->fs_info, bytenr, + root->root_key.objectid, gen, level, NULL); + if (IS_ERR(root->node)) { + ret = PTR_ERR(root->node); + root->node = NULL; + return ret; + } + if (!extent_buffer_uptodate(root->node)) { + free_extent_buffer(root->node); + root->node = NULL; + return -EIO; + } + + btrfs_set_root_node(&root->root_item, root->node); + root->commit_root = btrfs_root_node(root); + btrfs_set_root_refs(&root->root_item, 1); + return ret; +} + +static int load_important_roots(struct btrfs_fs_info *fs_info) +{ + struct btrfs_super_block *sb = fs_info->super_copy; + u64 gen, bytenr; + int level, ret; + + bytenr = btrfs_super_root(sb); + gen = btrfs_super_generation(sb); + level = btrfs_super_root_level(sb); + ret = load_super_root(fs_info->tree_root, bytenr, gen, level); + if (ret) { + btrfs_warn(fs_info, "couldn't read tree root"); + return ret; + } + return 0; +} + static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) { int backup_index = find_newest_super_backup(fs_info); @@ -2940,9 +2894,6 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) int i; for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { - u64 generation; - int level; - if (handle_error) { if (!IS_ERR(tree_root->node)) free_extent_buffer(tree_root->node); @@ -2967,29 +2918,13 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) if (ret < 0) return ret; } - generation = btrfs_super_generation(sb); - level = btrfs_super_root_level(sb); - tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb), - BTRFS_ROOT_TREE_OBJECTID, - generation, level, NULL); - if (IS_ERR(tree_root->node)) { - handle_error = true; - ret = PTR_ERR(tree_root->node); - tree_root->node = NULL; - btrfs_warn(fs_info, "couldn't read tree root"); - continue; - } else if (!extent_buffer_uptodate(tree_root->node)) { + ret = load_important_roots(fs_info); + if (ret) { handle_error = true; - ret = -EIO; - btrfs_warn(fs_info, "error while reading tree root"); continue; } - btrfs_set_root_node(&tree_root->root_item, tree_root->node); - tree_root->commit_root = btrfs_root_node(tree_root); - btrfs_set_root_refs(&tree_root->root_item, 1); - /* * No need to hold btrfs_root::objectid_mutex since the fs * hasn't been fully initialised and we are the only user @@ -3009,8 +2944,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) } /* All successful */ - fs_info->generation = generation; - fs_info->last_trans_committed = generation; + fs_info->generation = btrfs_header_generation(tree_root->node); + fs_info->last_trans_committed = fs_info->generation; fs_info->last_reloc_trans = 0; /* Always begin writing backup roots after the one being used */ @@ -3053,8 +2988,22 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); mutex_init(&fs_info->zoned_meta_io_lock); + mutex_init(&fs_info->zoned_data_reloc_io_lock); seqlock_init(&fs_info->profiles_lock); + btrfs_lockdep_init_map(fs_info, btrfs_trans_num_writers); + btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters); + btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered); + btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent); + btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_start, + BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked, + BTRFS_LOCKDEP_TRANS_UNBLOCKED); + btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed, + BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); + btrfs_state_lockdep_init_map(fs_info, btrfs_trans_completed, + BTRFS_LOCKDEP_TRANS_COMPLETED); + INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); @@ -3104,9 +3053,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(fs_info); - spin_lock_init(&fs_info->block_group_cache_lock); - fs_info->block_group_cache_tree = RB_ROOT; - fs_info->first_logical_byte = (u64)-1; + rwlock_init(&fs_info->block_group_cache_lock); + fs_info->block_group_cache_tree = RB_ROOT_CACHED; extent_io_tree_init(fs_info, &fs_info->excluded_extents, IO_TREE_FS_EXCLUDED_EXTENTS, NULL); @@ -3141,6 +3089,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->sectorsize_bits = ilog2(4096); fs_info->stripesize = 4096; + fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE; + spin_lock_init(&fs_info->swapfile_pins_lock); fs_info->swapfile_pins = RB_ROOT; @@ -3190,7 +3140,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block static int btrfs_uuid_rescan_kthread(void *data) { - struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; + struct btrfs_fs_info *fs_info = data; int ret; /* @@ -3293,7 +3243,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) up_read(&fs_info->cleanup_work_sem); mutex_lock(&fs_info->cleaner_mutex); - ret = btrfs_recover_relocation(fs_info->tree_root); + ret = btrfs_recover_relocation(fs_info); mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { btrfs_warn(fs_info, "failed to recover relocation: %d", ret); @@ -3343,6 +3293,112 @@ out: return ret; } +/* + * Do various sanity and dependency checks of different features. + * + * This is the place for less strict checks (like for subpage or artificial + * feature dependencies). + * + * For strict checks or possible corruption detection, see + * btrfs_validate_super(). + * + * This should be called after btrfs_parse_options(), as some mount options + * (space cache related) can modify on-disk format like free space tree and + * screw up certain feature dependencies. + */ +int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb) +{ + struct btrfs_super_block *disk_super = fs_info->super_copy; + u64 incompat = btrfs_super_incompat_flags(disk_super); + const u64 compat_ro = btrfs_super_compat_ro_flags(disk_super); + const u64 compat_ro_unsupp = (compat_ro & ~BTRFS_FEATURE_COMPAT_RO_SUPP); + + if (incompat & ~BTRFS_FEATURE_INCOMPAT_SUPP) { + btrfs_err(fs_info, + "cannot mount because of unknown incompat features (0x%llx)", + incompat); + return -EINVAL; + } + + /* Runtime limitation for mixed block groups. */ + if ((incompat & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && + (fs_info->sectorsize != fs_info->nodesize)) { + btrfs_err(fs_info, +"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups", + fs_info->nodesize, fs_info->sectorsize); + return -EINVAL; + } + + /* Mixed backref is an always-enabled feature. */ + incompat |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; + + /* Set compression related flags just in case. */ + if (fs_info->compress_type == BTRFS_COMPRESS_LZO) + incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD) + incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD; + + /* + * An ancient flag, which should really be marked deprecated. + * Such runtime limitation doesn't really need a incompat flag. + */ + if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) + incompat |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; + + if (compat_ro_unsupp && !sb_rdonly(sb)) { + btrfs_err(fs_info, + "cannot mount read-write because of unknown compat_ro features (0x%llx)", + compat_ro); + return -EINVAL; + } + + /* + * We have unsupported RO compat features, although RO mounted, we + * should not cause any metadata writes, including log replay. + * Or we could screw up whatever the new feature requires. + */ + if (compat_ro_unsupp && btrfs_super_log_root(disk_super) && + !btrfs_test_opt(fs_info, NOLOGREPLAY)) { + btrfs_err(fs_info, +"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay", + compat_ro); + return -EINVAL; + } + + /* + * Artificial limitations for block group tree, to force + * block-group-tree to rely on no-holes and free-space-tree. + */ + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) && + (!btrfs_fs_incompat(fs_info, NO_HOLES) || + !btrfs_test_opt(fs_info, FREE_SPACE_TREE))) { + btrfs_err(fs_info, +"block-group-tree feature requires no-holes and free-space-tree features"); + return -EINVAL; + } + + /* + * Subpage runtime limitation on v1 cache. + * + * V1 space cache still has some hard codeed PAGE_SIZE usage, while + * we're already defaulting to v2 cache, no need to bother v1 as it's + * going to be deprecated anyway. + */ + if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) { + btrfs_warn(fs_info, + "v1 space cache is not supported for page size %lu with sectorsize %u", + PAGE_SIZE, fs_info->sectorsize); + return -EINVAL; + } + + /* This can be called by remount, we need to protect the super block. */ + spin_lock(&fs_info->super_lock); + btrfs_set_super_incompat_flags(disk_super, incompat); + spin_unlock(&fs_info->super_lock); + + return 0; +} + int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options) { @@ -3423,7 +3479,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device * We want to check superblock checksum, the type is stored inside. * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). */ - if (btrfs_check_super_csum(fs_info, (u8 *)disk_super)) { + if (btrfs_check_super_csum(fs_info, disk_super)) { btrfs_err(fs_info, "superblock checksum mismatch"); err = -EINVAL; btrfs_release_disk_super(disk_super); @@ -3472,16 +3528,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ fs_info->compress_type = BTRFS_COMPRESS_ZLIB; - /* - * Flag our filesystem as having big metadata blocks if they are bigger - * than the page size. - */ - if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) { - if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) - btrfs_info(fs_info, - "flagging fs with big metadata feature"); - features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; - } /* Set up fs_info before parsing mount options */ nodesize = btrfs_super_nodesize(disk_super); @@ -3502,68 +3548,29 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_alloc; } - features = btrfs_super_incompat_flags(disk_super) & - ~BTRFS_FEATURE_INCOMPAT_SUPP; - if (features) { - btrfs_err(fs_info, - "cannot mount because of unsupported optional features (%llx)", - features); - err = -EINVAL; - goto fail_alloc; - } - - features = btrfs_super_incompat_flags(disk_super); - features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; - if (fs_info->compress_type == BTRFS_COMPRESS_LZO) - features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; - else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD) - features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD; - - if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) - btrfs_info(fs_info, "has skinny extents"); - - /* - * mixed block groups end up with duplicate but slightly offset - * extent buffers for the same range. It leads to corruptions - */ - if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && - (sectorsize != nodesize)) { - btrfs_err(fs_info, -"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups", - nodesize, sectorsize); - goto fail_alloc; - } - - /* - * Needn't use the lock because there is no other task which will - * update the flag. - */ - btrfs_set_super_incompat_flags(disk_super, features); - - features = btrfs_super_compat_ro_flags(disk_super) & - ~BTRFS_FEATURE_COMPAT_RO_SUPP; - if (!sb_rdonly(sb) && features) { - btrfs_err(fs_info, - "cannot mount read-write because of unsupported optional features (%llx)", - features); - err = -EINVAL; + ret = btrfs_check_features(fs_info, sb); + if (ret < 0) { + err = ret; goto fail_alloc; } if (sectorsize < PAGE_SIZE) { struct btrfs_subpage_info *subpage_info; + /* + * V1 space cache has some hardcoded PAGE_SIZE usage, and is + * going to be deprecated. + * + * Force to use v2 cache for subpage case. + */ + btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); + btrfs_set_and_info(fs_info, FREE_SPACE_TREE, + "forcing free space tree for sector size %u with page size %lu", + sectorsize, PAGE_SIZE); + btrfs_warn(fs_info, "read-write for sector size %u with page size %lu is experimental", sectorsize, PAGE_SIZE); - if (btrfs_super_incompat_flags(fs_info->super_copy) & - BTRFS_FEATURE_INCOMPAT_RAID56) { - btrfs_err(fs_info, - "RAID56 is not yet supported for sector size %u with page size %lu", - sectorsize, PAGE_SIZE); - err = -EINVAL; - goto fail_alloc; - } subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL); if (!subpage_info) goto fail_alloc; @@ -3594,21 +3601,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device generation = btrfs_super_chunk_root_generation(disk_super); level = btrfs_super_chunk_root_level(disk_super); - - chunk_root->node = read_tree_block(fs_info, - btrfs_super_chunk_root(disk_super), - BTRFS_CHUNK_TREE_OBJECTID, - generation, level, NULL); - if (IS_ERR(chunk_root->node) || - !extent_buffer_uptodate(chunk_root->node)) { + ret = load_super_root(chunk_root, btrfs_super_chunk_root(disk_super), + generation, level); + if (ret) { btrfs_err(fs_info, "failed to read chunk root"); - if (!IS_ERR(chunk_root->node)) - free_extent_buffer(chunk_root->node); - chunk_root->node = NULL; goto fail_tree_roots; } - btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); - chunk_root->commit_root = btrfs_root_node(chunk_root); read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, offsetof(struct btrfs_header, chunk_tree_uuid), @@ -3728,7 +3726,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_sysfs; } - fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, + fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info, "btrfs-cleaner"); if (IS_ERR(fs_info->cleaner_kthread)) goto fail_sysfs; @@ -3813,6 +3811,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device set_bit(BTRFS_FS_OPEN, &fs_info->flags); + /* Kick the cleaner thread so it'll start deleting snapshots. */ + if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags)) + wake_up_process(fs_info->cleaner_kthread); + clear_oneshot: btrfs_clear_oneshot_options(fs_info); return 0; @@ -3891,7 +3893,7 @@ static void btrfs_end_super_write(struct bio *bio) } struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, - int copy_num) + int copy_num, bool drop_cache) { struct btrfs_super_block *super; struct page *page; @@ -3909,6 +3911,19 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev)) return ERR_PTR(-EINVAL); + if (drop_cache) { + /* This should only be called with the primary sb. */ + ASSERT(copy_num == 0); + + /* + * Drop the page of the primary superblock, so later read will + * always read from the device. + */ + invalidate_inode_pages2_range(mapping, + bytenr >> PAGE_SHIFT, + (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT); + } + page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); if (IS_ERR(page)) return ERR_CAST(page); @@ -3940,7 +3955,7 @@ struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev) * later supers, using BTRFS_SUPER_MIRROR_MAX instead */ for (i = 0; i < 1; i++) { - super = btrfs_read_dev_one_super(bdev, i); + super = btrfs_read_dev_one_super(bdev, i, false); if (IS_ERR(super)) continue; @@ -4029,8 +4044,9 @@ static int write_dev_supers(struct btrfs_device *device, * to do I/O, so we don't lose the ability to do integrity * checking. */ - bio = bio_alloc(GFP_NOFS, 1); - bio_set_dev(bio, device->bdev); + bio = bio_alloc(device->bdev, 1, + REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, + GFP_NOFS); bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT; bio->bi_private = device; bio->bi_end_io = btrfs_end_super_write; @@ -4042,11 +4058,11 @@ static int write_dev_supers(struct btrfs_device *device, * go down lazy and there's a short window where the on-disk * copies might still contain the older version. */ - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO; if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) bio->bi_opf |= REQ_FUA; - btrfsic_submit_bio(bio); + btrfsic_check_bio(bio); + submit_bio(bio); if (btrfs_advance_sb_log(device, i)) errors++; @@ -4127,6 +4143,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) */ static void btrfs_end_empty_barrier(struct bio *bio) { + bio_uninit(bio); complete(bio->bi_private); } @@ -4136,7 +4153,7 @@ static void btrfs_end_empty_barrier(struct bio *bio) */ static void write_dev_flush(struct btrfs_device *device) { - struct bio *bio = device->flush_bio; + struct bio *bio = &device->flush_bio; #ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY /* @@ -4149,19 +4166,18 @@ static void write_dev_flush(struct btrfs_device *device) * of simplicity, since this is a debug tool and not meant for use in * non-debug builds. */ - struct request_queue *q = bdev_get_queue(device->bdev); - if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) + if (!bdev_write_cache(device->bdev)) return; #endif - bio_reset(bio); + bio_init(bio, device->bdev, NULL, 0, + REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH); bio->bi_end_io = btrfs_end_empty_barrier; - bio_set_dev(bio, device->bdev); - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; - btrfsic_submit_bio(bio); + btrfsic_check_bio(bio); + submit_bio(bio); set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); } @@ -4170,7 +4186,7 @@ static void write_dev_flush(struct btrfs_device *device) */ static blk_status_t wait_dev_flush(struct btrfs_device *device) { - struct bio *bio = device->flush_bio; + struct bio *bio = &device->flush_bio; if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)) return BLK_STS_OK; @@ -4530,6 +4546,28 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) int ret; set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); + + /* + * If we had UNFINISHED_DROPS we could still be processing them, so + * clear that bit and wake up relocation so it can stop. + * We must do this before stopping the block group reclaim task, because + * at btrfs_relocate_block_group() we wait for this bit, and after the + * wait we stop with -EINTR if btrfs_fs_closing() returns non-zero - we + * have just set BTRFS_FS_CLOSING_START, so btrfs_fs_closing() will + * return 1. + */ + btrfs_wake_unfinished_drop(fs_info); + + /* + * We may have the reclaim task running and relocating a data block group, + * in which case it may create delayed iputs. So stop it before we park + * the cleaner kthread otherwise we can get new delayed iputs after + * parking the cleaner, and that can make the async reclaim task to hang + * if it's waiting for delayed iputs to complete, since the cleaner is + * parked and can not run delayed iputs - this will make us hang when + * trying to stop the async reclaim task. + */ + cancel_work_sync(&fs_info->reclaim_bgs_work); /* * We don't want the cleaner to start new transactions, add more delayed * iputs, etc. while we're closing. We can't use kthread_stop() yet @@ -4560,12 +4598,35 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) /* clear out the rbtree of defraggable inodes */ btrfs_cleanup_defrag_inodes(fs_info); + /* + * After we parked the cleaner kthread, ordered extents may have + * completed and created new delayed iputs. If one of the async reclaim + * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we + * can hang forever trying to stop it, because if a delayed iput is + * added after it ran btrfs_run_delayed_iputs() and before it called + * btrfs_wait_on_delayed_iputs(), it will hang forever since there is + * no one else to run iputs. + * + * So wait for all ongoing ordered extents to complete and then run + * delayed iputs. This works because once we reach this point no one + * can either create new ordered extents nor create delayed iputs + * through some other means. + * + * Also note that btrfs_wait_ordered_roots() is not safe here, because + * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent, + * but the delayed iput for the respective inode is made only when doing + * the final btrfs_put_ordered_extent() (which must happen at + * btrfs_finish_ordered_io() when we are unmounting). + */ + btrfs_flush_workqueue(fs_info->endio_write_workers); + /* Ordered extents for free space inodes. */ + btrfs_flush_workqueue(fs_info->endio_freespace_worker); + btrfs_run_delayed_iputs(fs_info); + cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); - cancel_work_sync(&fs_info->reclaim_bgs_work); - /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); @@ -4747,13 +4808,6 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info) __btrfs_btree_balance_dirty(fs_info, 0); } -int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, - struct btrfs_key *first_key) -{ - return btree_read_extent_buffer_pages(buf, parent_transid, - level, first_key); -} - static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) { /* cleanup FS via transaction */ diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 5e8bef4b7563..9fa923e005a3 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -17,13 +17,6 @@ */ #define BTRFS_BDEV_BLOCKSIZE (4096) -enum btrfs_wq_endio_type { - BTRFS_WQ_ENDIO_DATA, - BTRFS_WQ_ENDIO_METADATA, - BTRFS_WQ_ENDIO_FREE_SPACE, - BTRFS_WQ_ENDIO_RAID56, -}; - static inline u64 btrfs_sb_offset(int mirror) { u64 start = SZ_16K; @@ -49,14 +42,19 @@ struct extent_buffer *btrfs_find_create_tree_block( void btrfs_clean_tree_block(struct extent_buffer *buf); void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info); int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); +int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, + const struct btrfs_super_block *disk_sb); int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options); void __cold close_ctree(struct btrfs_fs_info *fs_info); +int btrfs_validate_super(struct btrfs_fs_info *fs_info, + struct btrfs_super_block *sb, int mirror_num); +int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb); int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors); struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev); struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, - int copy_num); + int copy_num, bool drop_cache); int btrfs_commit_super(struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, struct btrfs_key *key); @@ -87,8 +85,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, struct page *page, u64 start, u64 end, int mirror); -blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags); +void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif @@ -111,6 +108,8 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) { + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) + return fs_info->block_group_root; return btrfs_extent_root(fs_info, 0); } @@ -118,14 +117,11 @@ void btrfs_put_root(struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); -int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, - struct btrfs_key *first_key); -blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, - enum btrfs_wq_endio_type metadata); -blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 dio_file_offset, - extent_submit_bio_start_t *submit_bio_start); +int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid, + int level, struct btrfs_key *first_key); +bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, + u64 dio_file_offset, + extent_submit_bio_start_t *submit_bio_start); blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, int mirror_num); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, @@ -145,17 +141,5 @@ int btree_lock_page_hook(struct page *page, void *data, int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid); int btrfs_init_root_free_objectid(struct btrfs_root *root); -int __init btrfs_end_io_wq_init(void); -void __cold btrfs_end_io_wq_exit(void); - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -void btrfs_set_buffer_lockdep_class(u64 objectid, - struct extent_buffer *eb, int level); -#else -static inline void btrfs_set_buffer_lockdep_class(u64 objectid, - struct extent_buffer *eb, int level) -{ -} -#endif #endif diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 1d4c2397d0d6..fab7eb76e53b 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -58,7 +58,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, } struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u32 generation, + u64 root_objectid, u64 generation, int check_generation) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h index f32f4113c976..5afb7ca42828 100644 --- a/fs/btrfs/export.h +++ b/fs/btrfs/export.h @@ -19,7 +19,7 @@ struct btrfs_fid { } __attribute__ ((packed)); struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u32 generation, + u64 root_objectid, u64 generation, int check_generation); struct dentry *btrfs_get_parent(struct dentry *child); diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c new file mode 100644 index 000000000000..83cb0378096f --- /dev/null +++ b/fs/btrfs/extent-io-tree.c @@ -0,0 +1,1674 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/slab.h> +#include <trace/events/btrfs.h> +#include "ctree.h" +#include "extent-io-tree.h" +#include "btrfs_inode.h" +#include "misc.h" + +static struct kmem_cache *extent_state_cache; + +static inline bool extent_state_in_tree(const struct extent_state *state) +{ + return !RB_EMPTY_NODE(&state->rb_node); +} + +#ifdef CONFIG_BTRFS_DEBUG +static LIST_HEAD(states); +static DEFINE_SPINLOCK(leak_lock); + +static inline void btrfs_leak_debug_add_state(struct extent_state *state) +{ + unsigned long flags; + + spin_lock_irqsave(&leak_lock, flags); + list_add(&state->leak_list, &states); + spin_unlock_irqrestore(&leak_lock, flags); +} + +static inline void btrfs_leak_debug_del_state(struct extent_state *state) +{ + unsigned long flags; + + spin_lock_irqsave(&leak_lock, flags); + list_del(&state->leak_list); + spin_unlock_irqrestore(&leak_lock, flags); +} + +static inline void btrfs_extent_state_leak_debug_check(void) +{ + struct extent_state *state; + + while (!list_empty(&states)) { + state = list_entry(states.next, struct extent_state, leak_list); + pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", + state->start, state->end, state->state, + extent_state_in_tree(state), + refcount_read(&state->refs)); + list_del(&state->leak_list); + kmem_cache_free(extent_state_cache, state); + } +} + +#define btrfs_debug_check_extent_io_range(tree, start, end) \ + __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end)) +static inline void __btrfs_debug_check_extent_io_range(const char *caller, + struct extent_io_tree *tree, + u64 start, u64 end) +{ + struct inode *inode = tree->private_data; + u64 isize; + + if (!inode) + return; + + isize = i_size_read(inode); + if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { + btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, + "%s: ino %llu isize %llu odd range [%llu,%llu]", + caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); + } +} +#else +#define btrfs_leak_debug_add_state(state) do {} while (0) +#define btrfs_leak_debug_del_state(state) do {} while (0) +#define btrfs_extent_state_leak_debug_check() do {} while (0) +#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) +#endif + +/* + * For the file_extent_tree, we want to hold the inode lock when we lookup and + * update the disk_i_size, but lockdep will complain because our io_tree we hold + * the tree lock and get the inode lock when setting delalloc. These two things + * are unrelated, so make a class for the file_extent_tree so we don't get the + * two locking patterns mixed up. + */ +static struct lock_class_key file_extent_tree_class; + +struct tree_entry { + u64 start; + u64 end; + struct rb_node rb_node; +}; + +void extent_io_tree_init(struct btrfs_fs_info *fs_info, + struct extent_io_tree *tree, unsigned int owner, + void *private_data) +{ + tree->fs_info = fs_info; + tree->state = RB_ROOT; + spin_lock_init(&tree->lock); + tree->private_data = private_data; + tree->owner = owner; + if (owner == IO_TREE_INODE_FILE_EXTENT) + lockdep_set_class(&tree->lock, &file_extent_tree_class); +} + +void extent_io_tree_release(struct extent_io_tree *tree) +{ + spin_lock(&tree->lock); + /* + * Do a single barrier for the waitqueue_active check here, the state + * of the waitqueue should not change once extent_io_tree_release is + * called. + */ + smp_mb(); + while (!RB_EMPTY_ROOT(&tree->state)) { + struct rb_node *node; + struct extent_state *state; + + node = rb_first(&tree->state); + state = rb_entry(node, struct extent_state, rb_node); + rb_erase(&state->rb_node, &tree->state); + RB_CLEAR_NODE(&state->rb_node); + /* + * btree io trees aren't supposed to have tasks waiting for + * changes in the flags of extent states ever. + */ + ASSERT(!waitqueue_active(&state->wq)); + free_extent_state(state); + + cond_resched_lock(&tree->lock); + } + spin_unlock(&tree->lock); +} + +static struct extent_state *alloc_extent_state(gfp_t mask) +{ + struct extent_state *state; + + /* + * The given mask might be not appropriate for the slab allocator, + * drop the unsupported bits + */ + mask &= ~(__GFP_DMA32|__GFP_HIGHMEM); + state = kmem_cache_alloc(extent_state_cache, mask); + if (!state) + return state; + state->state = 0; + RB_CLEAR_NODE(&state->rb_node); + btrfs_leak_debug_add_state(state); + refcount_set(&state->refs, 1); + init_waitqueue_head(&state->wq); + trace_alloc_extent_state(state, mask, _RET_IP_); + return state; +} + +static struct extent_state *alloc_extent_state_atomic(struct extent_state *prealloc) +{ + if (!prealloc) + prealloc = alloc_extent_state(GFP_ATOMIC); + + return prealloc; +} + +void free_extent_state(struct extent_state *state) +{ + if (!state) + return; + if (refcount_dec_and_test(&state->refs)) { + WARN_ON(extent_state_in_tree(state)); + btrfs_leak_debug_del_state(state); + trace_free_extent_state(state, _RET_IP_); + kmem_cache_free(extent_state_cache, state); + } +} + +static int add_extent_changeset(struct extent_state *state, u32 bits, + struct extent_changeset *changeset, + int set) +{ + int ret; + + if (!changeset) + return 0; + if (set && (state->state & bits) == bits) + return 0; + if (!set && (state->state & bits) == 0) + return 0; + changeset->bytes_changed += state->end - state->start + 1; + ret = ulist_add(&changeset->range_changed, state->start, state->end, + GFP_ATOMIC); + return ret; +} + +static inline struct extent_state *next_state(struct extent_state *state) +{ + struct rb_node *next = rb_next(&state->rb_node); + + if (next) + return rb_entry(next, struct extent_state, rb_node); + else + return NULL; +} + +static inline struct extent_state *prev_state(struct extent_state *state) +{ + struct rb_node *next = rb_prev(&state->rb_node); + + if (next) + return rb_entry(next, struct extent_state, rb_node); + else + return NULL; +} + +/* + * Search @tree for an entry that contains @offset. Such entry would have + * entry->start <= offset && entry->end >= offset. + * + * @tree: the tree to search + * @offset: offset that should fall within an entry in @tree + * @node_ret: pointer where new node should be anchored (used when inserting an + * entry in the tree) + * @parent_ret: points to entry which would have been the parent of the entry, + * containing @offset + * + * Return a pointer to the entry that contains @offset byte address and don't change + * @node_ret and @parent_ret. + * + * If no such entry exists, return pointer to entry that ends before @offset + * and fill parameters @node_ret and @parent_ret, ie. does not return NULL. + */ +static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree, + u64 offset, + struct rb_node ***node_ret, + struct rb_node **parent_ret) +{ + struct rb_root *root = &tree->state; + struct rb_node **node = &root->rb_node; + struct rb_node *prev = NULL; + struct extent_state *entry = NULL; + + while (*node) { + prev = *node; + entry = rb_entry(prev, struct extent_state, rb_node); + + if (offset < entry->start) + node = &(*node)->rb_left; + else if (offset > entry->end) + node = &(*node)->rb_right; + else + return entry; + } + + if (node_ret) + *node_ret = node; + if (parent_ret) + *parent_ret = prev; + + /* Search neighbors until we find the first one past the end */ + while (entry && offset > entry->end) + entry = next_state(entry); + + return entry; +} + +/* + * Search offset in the tree or fill neighbor rbtree node pointers. + * + * @tree: the tree to search + * @offset: offset that should fall within an entry in @tree + * @next_ret: pointer to the first entry whose range ends after @offset + * @prev_ret: pointer to the first entry whose range begins before @offset + * + * Return a pointer to the entry that contains @offset byte address. If no + * such entry exists, then return NULL and fill @prev_ret and @next_ret. + * Otherwise return the found entry and other pointers are left untouched. + */ +static struct extent_state *tree_search_prev_next(struct extent_io_tree *tree, + u64 offset, + struct extent_state **prev_ret, + struct extent_state **next_ret) +{ + struct rb_root *root = &tree->state; + struct rb_node **node = &root->rb_node; + struct extent_state *orig_prev; + struct extent_state *entry = NULL; + + ASSERT(prev_ret); + ASSERT(next_ret); + + while (*node) { + entry = rb_entry(*node, struct extent_state, rb_node); + + if (offset < entry->start) + node = &(*node)->rb_left; + else if (offset > entry->end) + node = &(*node)->rb_right; + else + return entry; + } + + orig_prev = entry; + while (entry && offset > entry->end) + entry = next_state(entry); + *next_ret = entry; + entry = orig_prev; + + while (entry && offset < entry->start) + entry = prev_state(entry); + *prev_ret = entry; + + return NULL; +} + +/* + * Inexact rb-tree search, return the next entry if @offset is not found + */ +static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 offset) +{ + return tree_search_for_insert(tree, offset, NULL, NULL); +} + +static void extent_io_tree_panic(struct extent_io_tree *tree, int err) +{ + btrfs_panic(tree->fs_info, err, + "locking error: extent tree was modified by another thread while locked"); +} + +/* + * Utility function to look for merge candidates inside a given range. Any + * extents with matching state are merged together into a single extent in the + * tree. Extents with EXTENT_IO in their state field are not merged because + * the end_io handlers need to be able to do operations on them without + * sleeping (or doing allocations/splits). + * + * This should be called with the tree lock held. + */ +static void merge_state(struct extent_io_tree *tree, struct extent_state *state) +{ + struct extent_state *other; + + if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY)) + return; + + other = prev_state(state); + if (other && other->end == state->start - 1 && + other->state == state->state) { + if (tree->private_data) + btrfs_merge_delalloc_extent(tree->private_data, + state, other); + state->start = other->start; + rb_erase(&other->rb_node, &tree->state); + RB_CLEAR_NODE(&other->rb_node); + free_extent_state(other); + } + other = next_state(state); + if (other && other->start == state->end + 1 && + other->state == state->state) { + if (tree->private_data) + btrfs_merge_delalloc_extent(tree->private_data, state, + other); + state->end = other->end; + rb_erase(&other->rb_node, &tree->state); + RB_CLEAR_NODE(&other->rb_node); + free_extent_state(other); + } +} + +static void set_state_bits(struct extent_io_tree *tree, + struct extent_state *state, + u32 bits, struct extent_changeset *changeset) +{ + u32 bits_to_set = bits & ~EXTENT_CTLBITS; + int ret; + + if (tree->private_data) + btrfs_set_delalloc_extent(tree->private_data, state, bits); + + ret = add_extent_changeset(state, bits_to_set, changeset, 1); + BUG_ON(ret < 0); + state->state |= bits_to_set; +} + +/* + * Insert an extent_state struct into the tree. 'bits' are set on the + * struct before it is inserted. + * + * This may return -EEXIST if the extent is already there, in which case the + * state struct is freed. + * + * The tree lock is not taken internally. This is a utility function and + * probably isn't what you want to call (see set/clear_extent_bit). + */ +static int insert_state(struct extent_io_tree *tree, + struct extent_state *state, + u32 bits, struct extent_changeset *changeset) +{ + struct rb_node **node; + struct rb_node *parent; + const u64 end = state->end; + + set_state_bits(tree, state, bits, changeset); + + node = &tree->state.rb_node; + while (*node) { + struct extent_state *entry; + + parent = *node; + entry = rb_entry(parent, struct extent_state, rb_node); + + if (end < entry->start) { + node = &(*node)->rb_left; + } else if (end > entry->end) { + node = &(*node)->rb_right; + } else { + btrfs_err(tree->fs_info, + "found node %llu %llu on insert of %llu %llu", + entry->start, entry->end, state->start, end); + return -EEXIST; + } + } + + rb_link_node(&state->rb_node, parent, node); + rb_insert_color(&state->rb_node, &tree->state); + + merge_state(tree, state); + return 0; +} + +/* + * Insert state to @tree to the location given by @node and @parent. + */ +static void insert_state_fast(struct extent_io_tree *tree, + struct extent_state *state, struct rb_node **node, + struct rb_node *parent, unsigned bits, + struct extent_changeset *changeset) +{ + set_state_bits(tree, state, bits, changeset); + rb_link_node(&state->rb_node, parent, node); + rb_insert_color(&state->rb_node, &tree->state); + merge_state(tree, state); +} + +/* + * Split a given extent state struct in two, inserting the preallocated + * struct 'prealloc' as the newly created second half. 'split' indicates an + * offset inside 'orig' where it should be split. + * + * Before calling, + * the tree has 'orig' at [orig->start, orig->end]. After calling, there + * are two extent state structs in the tree: + * prealloc: [orig->start, split - 1] + * orig: [ split, orig->end ] + * + * The tree locks are not taken by this function. They need to be held + * by the caller. + */ +static int split_state(struct extent_io_tree *tree, struct extent_state *orig, + struct extent_state *prealloc, u64 split) +{ + struct rb_node *parent = NULL; + struct rb_node **node; + + if (tree->private_data) + btrfs_split_delalloc_extent(tree->private_data, orig, split); + + prealloc->start = orig->start; + prealloc->end = split - 1; + prealloc->state = orig->state; + orig->start = split; + + parent = &orig->rb_node; + node = &parent; + while (*node) { + struct extent_state *entry; + + parent = *node; + entry = rb_entry(parent, struct extent_state, rb_node); + + if (prealloc->end < entry->start) { + node = &(*node)->rb_left; + } else if (prealloc->end > entry->end) { + node = &(*node)->rb_right; + } else { + free_extent_state(prealloc); + return -EEXIST; + } + } + + rb_link_node(&prealloc->rb_node, parent, node); + rb_insert_color(&prealloc->rb_node, &tree->state); + + return 0; +} + +/* + * Utility function to clear some bits in an extent state struct. It will + * optionally wake up anyone waiting on this state (wake == 1). + * + * If no bits are set on the state struct after clearing things, the + * struct is freed and removed from the tree + */ +static struct extent_state *clear_state_bit(struct extent_io_tree *tree, + struct extent_state *state, + u32 bits, int wake, + struct extent_changeset *changeset) +{ + struct extent_state *next; + u32 bits_to_clear = bits & ~EXTENT_CTLBITS; + int ret; + + if (tree->private_data) + btrfs_clear_delalloc_extent(tree->private_data, state, bits); + + ret = add_extent_changeset(state, bits_to_clear, changeset, 0); + BUG_ON(ret < 0); + state->state &= ~bits_to_clear; + if (wake) + wake_up(&state->wq); + if (state->state == 0) { + next = next_state(state); + if (extent_state_in_tree(state)) { + rb_erase(&state->rb_node, &tree->state); + RB_CLEAR_NODE(&state->rb_node); + free_extent_state(state); + } else { + WARN_ON(1); + } + } else { + merge_state(tree, state); + next = next_state(state); + } + return next; +} + +/* + * Clear some bits on a range in the tree. This may require splitting or + * inserting elements in the tree, so the gfp mask is used to indicate which + * allocations or sleeping are allowed. + * + * Pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove the given + * range from the tree regardless of state (ie for truncate). + * + * The range [start, end] is inclusive. + * + * This takes the tree lock, and returns 0 on success and < 0 on error. + */ +int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached_state, + gfp_t mask, struct extent_changeset *changeset) +{ + struct extent_state *state; + struct extent_state *cached; + struct extent_state *prealloc = NULL; + u64 last_end; + int err; + int clear = 0; + int wake; + int delete = (bits & EXTENT_CLEAR_ALL_BITS); + + btrfs_debug_check_extent_io_range(tree, start, end); + trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits); + + if (delete) + bits |= ~EXTENT_CTLBITS; + + if (bits & EXTENT_DELALLOC) + bits |= EXTENT_NORESERVE; + + wake = (bits & EXTENT_LOCKED) ? 1 : 0; + if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)) + clear = 1; +again: + if (!prealloc && gfpflags_allow_blocking(mask)) { + /* + * Don't care for allocation failure here because we might end + * up not needing the pre-allocated extent state at all, which + * is the case if we only have in the tree extent states that + * cover our input range and don't cover too any other range. + * If we end up needing a new extent state we allocate it later. + */ + prealloc = alloc_extent_state(mask); + } + + spin_lock(&tree->lock); + if (cached_state) { + cached = *cached_state; + + if (clear) { + *cached_state = NULL; + cached_state = NULL; + } + + if (cached && extent_state_in_tree(cached) && + cached->start <= start && cached->end > start) { + if (clear) + refcount_dec(&cached->refs); + state = cached; + goto hit_next; + } + if (clear) + free_extent_state(cached); + } + + /* This search will find the extents that end after our range starts. */ + state = tree_search(tree, start); + if (!state) + goto out; +hit_next: + if (state->start > end) + goto out; + WARN_ON(state->end < start); + last_end = state->end; + + /* The state doesn't have the wanted bits, go ahead. */ + if (!(state->state & bits)) { + state = next_state(state); + goto next; + } + + /* + * | ---- desired range ---- | + * | state | or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on second + * half. + * + * If the extent we found extends past our range, we just split and + * search again. It'll get split again the next time though. + * + * If the extent we found is inside our range, we clear the desired bit + * on it. + */ + + if (state->start < start) { + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + err = split_state(tree, state, prealloc, start); + if (err) + extent_io_tree_panic(tree, err); + + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + state = clear_state_bit(tree, state, bits, wake, changeset); + goto next; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and clear the bit on the first half. + */ + if (state->start <= end && state->end > end) { + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + err = split_state(tree, state, prealloc, end + 1); + if (err) + extent_io_tree_panic(tree, err); + + if (wake) + wake_up(&state->wq); + + clear_state_bit(tree, prealloc, bits, wake, changeset); + + prealloc = NULL; + goto out; + } + + state = clear_state_bit(tree, state, bits, wake, changeset); +next: + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + if (start <= end && state && !need_resched()) + goto hit_next; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (gfpflags_allow_blocking(mask)) + cond_resched(); + goto again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return 0; + +} + +static void wait_on_state(struct extent_io_tree *tree, + struct extent_state *state) + __releases(tree->lock) + __acquires(tree->lock) +{ + DEFINE_WAIT(wait); + prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&tree->lock); + schedule(); + spin_lock(&tree->lock); + finish_wait(&state->wq, &wait); +} + +/* + * Wait for one or more bits to clear on a range in the state tree. + * The range [start, end] is inclusive. + * The tree lock is taken by this function + */ +void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits) +{ + struct extent_state *state; + + btrfs_debug_check_extent_io_range(tree, start, end); + + spin_lock(&tree->lock); +again: + while (1) { + /* + * This search will find all the extents that end after our + * range starts. + */ + state = tree_search(tree, start); +process_node: + if (!state) + break; + if (state->start > end) + goto out; + + if (state->state & bits) { + start = state->start; + refcount_inc(&state->refs); + wait_on_state(tree, state); + free_extent_state(state); + goto again; + } + start = state->end + 1; + + if (start > end) + break; + + if (!cond_resched_lock(&tree->lock)) { + state = next_state(state); + goto process_node; + } + } +out: + spin_unlock(&tree->lock); +} + +static void cache_state_if_flags(struct extent_state *state, + struct extent_state **cached_ptr, + unsigned flags) +{ + if (cached_ptr && !(*cached_ptr)) { + if (!flags || (state->state & flags)) { + *cached_ptr = state; + refcount_inc(&state->refs); + } + } +} + +static void cache_state(struct extent_state *state, + struct extent_state **cached_ptr) +{ + return cache_state_if_flags(state, cached_ptr, + EXTENT_LOCKED | EXTENT_BOUNDARY); +} + +/* + * Find the first state struct with 'bits' set after 'start', and return it. + * tree->lock must be held. NULL will returned if nothing was found after + * 'start'. + */ +static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, + u64 start, u32 bits) +{ + struct extent_state *state; + + /* + * This search will find all the extents that end after our range + * starts. + */ + state = tree_search(tree, start); + while (state) { + if (state->end >= start && (state->state & bits)) + return state; + state = next_state(state); + } + return NULL; +} + +/* + * Find the first offset in the io tree with one or more @bits set. + * + * Note: If there are multiple bits set in @bits, any of them will match. + * + * Return 0 if we find something, and update @start_ret and @end_ret. + * Return 1 if we found nothing. + */ +int find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits, + struct extent_state **cached_state) +{ + struct extent_state *state; + int ret = 1; + + spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->end == start - 1 && extent_state_in_tree(state)) { + while ((state = next_state(state)) != NULL) { + if (state->state & bits) + goto got_it; + } + free_extent_state(*cached_state); + *cached_state = NULL; + goto out; + } + free_extent_state(*cached_state); + *cached_state = NULL; + } + + state = find_first_extent_bit_state(tree, start, bits); +got_it: + if (state) { + cache_state_if_flags(state, cached_state, 0); + *start_ret = state->start; + *end_ret = state->end; + ret = 0; + } +out: + spin_unlock(&tree->lock); + return ret; +} + +/* + * Find a contiguous area of bits + * + * @tree: io tree to check + * @start: offset to start the search from + * @start_ret: the first offset we found with the bits set + * @end_ret: the final contiguous range of the bits that were set + * @bits: bits to look for + * + * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges + * to set bits appropriately, and then merge them again. During this time it + * will drop the tree->lock, so use this helper if you want to find the actual + * contiguous area for given bits. We will search to the first bit we find, and + * then walk down the tree until we find a non-contiguous area. The area + * returned will be the full contiguous area with the bits set. + */ +int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits) +{ + struct extent_state *state; + int ret = 1; + + spin_lock(&tree->lock); + state = find_first_extent_bit_state(tree, start, bits); + if (state) { + *start_ret = state->start; + *end_ret = state->end; + while ((state = next_state(state)) != NULL) { + if (state->start > (*end_ret + 1)) + break; + *end_ret = state->end; + } + ret = 0; + } + spin_unlock(&tree->lock); + return ret; +} + +/* + * Find a contiguous range of bytes in the file marked as delalloc, not more + * than 'max_bytes'. start and end are used to return the range, + * + * True is returned if we find something, false if nothing was in the tree. + */ +bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, + u64 *end, u64 max_bytes, + struct extent_state **cached_state) +{ + struct extent_state *state; + u64 cur_start = *start; + bool found = false; + u64 total_bytes = 0; + + spin_lock(&tree->lock); + + /* + * This search will find all the extents that end after our range + * starts. + */ + state = tree_search(tree, cur_start); + if (!state) { + *end = (u64)-1; + goto out; + } + + while (state) { + if (found && (state->start != cur_start || + (state->state & EXTENT_BOUNDARY))) { + goto out; + } + if (!(state->state & EXTENT_DELALLOC)) { + if (!found) + *end = state->end; + goto out; + } + if (!found) { + *start = state->start; + *cached_state = state; + refcount_inc(&state->refs); + } + found = true; + *end = state->end; + cur_start = state->end + 1; + total_bytes += state->end - state->start + 1; + if (total_bytes >= max_bytes) + break; + state = next_state(state); + } +out: + spin_unlock(&tree->lock); + return found; +} + +/* + * Set some bits on a range in the tree. This may require allocations or + * sleeping, so the gfp mask is used to indicate what is allowed. + * + * If any of the exclusive bits are set, this will fail with -EEXIST if some + * part of the range already has the desired bits set. The start of the + * existing range is returned in failed_start in this case. + * + * [start, end] is inclusive This takes the tree lock. + */ +static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, u64 *failed_start, + struct extent_state **cached_state, + struct extent_changeset *changeset, gfp_t mask) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node **p; + struct rb_node *parent; + int err = 0; + u64 last_start; + u64 last_end; + u32 exclusive_bits = (bits & EXTENT_LOCKED); + + btrfs_debug_check_extent_io_range(tree, start, end); + trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits); + + if (exclusive_bits) + ASSERT(failed_start); + else + ASSERT(failed_start == NULL); +again: + if (!prealloc && gfpflags_allow_blocking(mask)) { + /* + * Don't care for allocation failure here because we might end + * up not needing the pre-allocated extent state at all, which + * is the case if we only have in the tree extent states that + * cover our input range and don't cover too any other range. + * If we end up needing a new extent state we allocate it later. + */ + prealloc = alloc_extent_state(mask); + } + + spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->start <= start && state->end > start && + extent_state_in_tree(state)) + goto hit_next; + } + /* + * This search will find all the extents that end after our range + * starts. + */ + state = tree_search_for_insert(tree, start, &p, &parent); + if (!state) { + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + prealloc->start = start; + prealloc->end = end; + insert_state_fast(tree, prealloc, p, parent, bits, changeset); + cache_state(prealloc, cached_state); + prealloc = NULL; + goto out; + } +hit_next: + last_start = state->start; + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | + * + * Just lock what we found and keep going + */ + if (state->start == start && state->end <= end) { + if (state->state & exclusive_bits) { + *failed_start = state->start; + err = -EEXIST; + goto out; + } + + set_state_bits(tree, state, bits, changeset); + cache_state(state, cached_state); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + state = next_state(state); + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; + goto search_again; + } + + /* + * | ---- desired range ---- | + * | state | + * or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on second + * half. + * + * If the extent we found extends past our range, we just split and + * search again. It'll get split again the next time though. + * + * If the extent we found is inside our range, we set the desired bit + * on it. + */ + if (state->start < start) { + if (state->state & exclusive_bits) { + *failed_start = start; + err = -EEXIST; + goto out; + } + + /* + * If this extent already has all the bits we want set, then + * skip it, not necessary to split it or do anything with it. + */ + if ((state->state & bits) == bits) { + start = state->end + 1; + cache_state(state, cached_state); + goto search_again; + } + + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + err = split_state(tree, state, prealloc, start); + if (err) + extent_io_tree_panic(tree, err); + + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + set_state_bits(tree, state, bits, changeset); + cache_state(state, cached_state); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + state = next_state(state); + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | or | state | + * + * There's a hole, we need to insert something in it and ignore the + * extent we found. + */ + if (state->start > start) { + u64 this_end; + if (end < last_start) + this_end = end; + else + this_end = last_start - 1; + + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + + /* + * Avoid to free 'prealloc' if it can be merged with the later + * extent. + */ + prealloc->start = start; + prealloc->end = this_end; + err = insert_state(tree, prealloc, bits, changeset); + if (err) + extent_io_tree_panic(tree, err); + + cache_state(prealloc, cached_state); + prealloc = NULL; + start = this_end + 1; + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * + * We need to split the extent, and set the bit on the first half + */ + if (state->start <= end && state->end > end) { + if (state->state & exclusive_bits) { + *failed_start = start; + err = -EEXIST; + goto out; + } + + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + err = split_state(tree, state, prealloc, end + 1); + if (err) + extent_io_tree_panic(tree, err); + + set_state_bits(tree, prealloc, bits, changeset); + cache_state(prealloc, cached_state); + merge_state(tree, prealloc); + prealloc = NULL; + goto out; + } + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (gfpflags_allow_blocking(mask)) + cond_resched(); + goto again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return err; + +} + +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached_state, gfp_t mask) +{ + return __set_extent_bit(tree, start, end, bits, NULL, cached_state, + NULL, mask); +} + +/* + * Convert all bits in a given range from one bit to another + * + * @tree: the io tree to search + * @start: the start offset in bytes + * @end: the end offset in bytes (inclusive) + * @bits: the bits to set in this range + * @clear_bits: the bits to clear in this range + * @cached_state: state that we're going to cache + * + * This will go through and set bits for the given range. If any states exist + * already in this range they are set with the given bit and cleared of the + * clear_bits. This is only meant to be used by things that are mergeable, ie. + * converting from say DELALLOC to DIRTY. This is not meant to be used with + * boundary bits like LOCK. + * + * All allocations are done with GFP_NOFS. + */ +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, u32 clear_bits, + struct extent_state **cached_state) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node **p; + struct rb_node *parent; + int err = 0; + u64 last_start; + u64 last_end; + bool first_iteration = true; + + btrfs_debug_check_extent_io_range(tree, start, end); + trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits, + clear_bits); + +again: + if (!prealloc) { + /* + * Best effort, don't worry if extent state allocation fails + * here for the first iteration. We might have a cached state + * that matches exactly the target range, in which case no + * extent state allocations are needed. We'll only know this + * after locking the tree. + */ + prealloc = alloc_extent_state(GFP_NOFS); + if (!prealloc && !first_iteration) + return -ENOMEM; + } + + spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->start <= start && state->end > start && + extent_state_in_tree(state)) + goto hit_next; + } + + /* + * This search will find all the extents that end after our range + * starts. + */ + state = tree_search_for_insert(tree, start, &p, &parent); + if (!state) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + prealloc->start = start; + prealloc->end = end; + insert_state_fast(tree, prealloc, p, parent, bits, NULL); + cache_state(prealloc, cached_state); + prealloc = NULL; + goto out; + } +hit_next: + last_start = state->start; + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | + * + * Just lock what we found and keep going. + */ + if (state->start == start && state->end <= end) { + set_state_bits(tree, state, bits, NULL); + cache_state(state, cached_state); + state = clear_state_bit(tree, state, clear_bits, 0, NULL); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; + goto search_again; + } + + /* + * | ---- desired range ---- | + * | state | + * or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on second + * half. + * + * If the extent we found extends past our range, we just split and + * search again. It'll get split again the next time though. + * + * If the extent we found is inside our range, we set the desired bit + * on it. + */ + if (state->start < start) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + err = split_state(tree, state, prealloc, start); + if (err) + extent_io_tree_panic(tree, err); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + set_state_bits(tree, state, bits, NULL); + cache_state(state, cached_state); + state = clear_state_bit(tree, state, clear_bits, 0, NULL); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | or | state | + * + * There's a hole, we need to insert something in it and ignore the + * extent we found. + */ + if (state->start > start) { + u64 this_end; + if (end < last_start) + this_end = end; + else + this_end = last_start - 1; + + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + + /* + * Avoid to free 'prealloc' if it can be merged with the later + * extent. + */ + prealloc->start = start; + prealloc->end = this_end; + err = insert_state(tree, prealloc, bits, NULL); + if (err) + extent_io_tree_panic(tree, err); + cache_state(prealloc, cached_state); + prealloc = NULL; + start = this_end + 1; + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * + * We need to split the extent, and set the bit on the first half. + */ + if (state->start <= end && state->end > end) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + + err = split_state(tree, state, prealloc, end + 1); + if (err) + extent_io_tree_panic(tree, err); + + set_state_bits(tree, prealloc, bits, NULL); + cache_state(prealloc, cached_state); + clear_state_bit(tree, prealloc, clear_bits, 0, NULL); + prealloc = NULL; + goto out; + } + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + cond_resched(); + first_iteration = false; + goto again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return err; +} + +/* + * Find the first range that has @bits not set. This range could start before + * @start. + * + * @tree: the tree to search + * @start: offset at/after which the found extent should start + * @start_ret: records the beginning of the range + * @end_ret: records the end of the range (inclusive) + * @bits: the set of bits which must be unset + * + * Since unallocated range is also considered one which doesn't have the bits + * set it's possible that @end_ret contains -1, this happens in case the range + * spans (last_range_end, end of device]. In this case it's up to the caller to + * trim @end_ret to the appropriate size. + */ +void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits) +{ + struct extent_state *state; + struct extent_state *prev = NULL, *next; + + spin_lock(&tree->lock); + + /* Find first extent with bits cleared */ + while (1) { + state = tree_search_prev_next(tree, start, &prev, &next); + if (!state && !next && !prev) { + /* + * Tree is completely empty, send full range and let + * caller deal with it + */ + *start_ret = 0; + *end_ret = -1; + goto out; + } else if (!state && !next) { + /* + * We are past the last allocated chunk, set start at + * the end of the last extent. + */ + *start_ret = prev->end + 1; + *end_ret = -1; + goto out; + } else if (!state) { + state = next; + } + + /* + * At this point 'state' either contains 'start' or start is + * before 'state' + */ + if (in_range(start, state->start, state->end - state->start + 1)) { + if (state->state & bits) { + /* + * |--range with bits sets--| + * | + * start + */ + start = state->end + 1; + } else { + /* + * 'start' falls within a range that doesn't + * have the bits set, so take its start as the + * beginning of the desired range + * + * |--range with bits cleared----| + * | + * start + */ + *start_ret = state->start; + break; + } + } else { + /* + * |---prev range---|---hole/unset---|---node range---| + * | + * start + * + * or + * + * |---hole/unset--||--first node--| + * 0 | + * start + */ + if (prev) + *start_ret = prev->end + 1; + else + *start_ret = 0; + break; + } + } + + /* + * Find the longest stretch from start until an entry which has the + * bits set + */ + while (state) { + if (state->end >= start && !(state->state & bits)) { + *end_ret = state->end; + } else { + *end_ret = state->start - 1; + break; + } + state = next_state(state); + } +out: + spin_unlock(&tree->lock); +} + +/* + * Count the number of bytes in the tree that have a given bit(s) set. This + * can be fairly slow, except for EXTENT_DIRTY which is cached. The total + * number found is returned. + */ +u64 count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, u64 max_bytes, + u32 bits, int contig) +{ + struct extent_state *state; + u64 cur_start = *start; + u64 total_bytes = 0; + u64 last = 0; + int found = 0; + + if (WARN_ON(search_end <= cur_start)) + return 0; + + spin_lock(&tree->lock); + + /* + * This search will find all the extents that end after our range + * starts. + */ + state = tree_search(tree, cur_start); + while (state) { + if (state->start > search_end) + break; + if (contig && found && state->start > last + 1) + break; + if (state->end >= cur_start && (state->state & bits) == bits) { + total_bytes += min(search_end, state->end) + 1 - + max(cur_start, state->start); + if (total_bytes >= max_bytes) + break; + if (!found) { + *start = max(cur_start, state->start); + found = 1; + } + last = state->end; + } else if (contig && found) { + break; + } + state = next_state(state); + } + spin_unlock(&tree->lock); + return total_bytes; +} + +/* + * Searche a range in the state tree for a given mask. If 'filled' == 1, this + * returns 1 only if every extent in the tree has the bits set. Otherwise, 1 + * is returned if any bit in the range is found set. + */ +int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, int filled, struct extent_state *cached) +{ + struct extent_state *state = NULL; + int bitset = 0; + + spin_lock(&tree->lock); + if (cached && extent_state_in_tree(cached) && cached->start <= start && + cached->end > start) + state = cached; + else + state = tree_search(tree, start); + while (state && start <= end) { + if (filled && state->start > start) { + bitset = 0; + break; + } + + if (state->start > end) + break; + + if (state->state & bits) { + bitset = 1; + if (!filled) + break; + } else if (filled) { + bitset = 0; + break; + } + + if (state->end == (u64)-1) + break; + + start = state->end + 1; + if (start > end) + break; + state = next_state(state); + } + + /* We ran out of states and were still inside of our range. */ + if (filled && !state) + bitset = 0; + spin_unlock(&tree->lock); + return bitset; +} + +/* Wrappers around set/clear extent bit */ +int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_changeset *changeset) +{ + /* + * We don't support EXTENT_LOCKED yet, as current changeset will + * record any bits changed, so for EXTENT_LOCKED case, it will + * either fail with -EEXIST or changeset will record the whole + * range. + */ + ASSERT(!(bits & EXTENT_LOCKED)); + + return __set_extent_bit(tree, start, end, bits, NULL, NULL, changeset, + GFP_NOFS); +} + +int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_changeset *changeset) +{ + /* + * Don't support EXTENT_LOCKED case, same reason as + * set_record_extent_bits(). + */ + ASSERT(!(bits & EXTENT_LOCKED)); + + return __clear_extent_bit(tree, start, end, bits, NULL, GFP_NOFS, + changeset); +} + +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) +{ + int err; + u64 failed_start; + + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, + NULL, NULL, GFP_NOFS); + if (err == -EEXIST) { + if (failed_start > start) + clear_extent_bit(tree, start, failed_start - 1, + EXTENT_LOCKED, NULL); + return 0; + } + return 1; +} + +/* + * Either insert or lock state struct between start and end use mask to tell + * us if waiting is desired. + */ +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state) +{ + int err; + u64 failed_start; + + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, + cached_state, NULL, GFP_NOFS); + while (err == -EEXIST) { + if (failed_start != start) + clear_extent_bit(tree, start, failed_start - 1, + EXTENT_LOCKED, cached_state); + + wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, + &failed_start, cached_state, NULL, + GFP_NOFS); + } + return err; +} + +void __cold extent_state_free_cachep(void) +{ + btrfs_extent_state_leak_debug_check(); + kmem_cache_destroy(extent_state_cache); +} + +int __init extent_state_init_cachep(void) +{ + extent_state_cache = kmem_cache_create("btrfs_extent_state", + sizeof(struct extent_state), 0, + SLAB_MEM_SPREAD, NULL); + if (!extent_state_cache) + return -ENOMEM; + + return 0; +} diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 04083ee5ae6e..a855f40dd61d 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -17,7 +17,6 @@ struct io_failure_record; #define EXTENT_NODATASUM (1U << 7) #define EXTENT_CLEAR_META_RESV (1U << 8) #define EXTENT_NEED_WAIT (1U << 9) -#define EXTENT_DAMAGED (1U << 10) #define EXTENT_NORESERVE (1U << 11) #define EXTENT_QGROUP_RESERVED (1U << 12) #define EXTENT_CLEAR_DATA_RESV (1U << 13) @@ -35,10 +34,18 @@ struct io_failure_record; * delalloc bytes decremented, in an atomic way to prevent races with stat(2). */ #define EXTENT_ADD_INODE_BYTES (1U << 15) + +/* + * Set during truncate when we're clearing an entire range and we just want the + * extent states to go away. + */ +#define EXTENT_CLEAR_ALL_BITS (1U << 16) + #define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ EXTENT_CLEAR_DATA_RESV) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | \ - EXTENT_ADD_INODE_BYTES) + EXTENT_ADD_INODE_BYTES | \ + EXTENT_CLEAR_ALL_BITS) /* * Redefined bits above which are used only in the device allocation tree, @@ -56,7 +63,6 @@ enum { IO_TREE_FS_EXCLUDED_EXTENTS, IO_TREE_BTREE_INODE_IO, IO_TREE_INODE_IO, - IO_TREE_INODE_IO_FAILURE, IO_TREE_RELOC_BLOCKS, IO_TREE_TRANS_DIRTY_PAGES, IO_TREE_ROOT_DIRTY_LOG_PAGES, @@ -70,8 +76,6 @@ struct extent_io_tree { struct rb_root state; struct btrfs_fs_info *fs_info; void *private_data; - u64 dirty_bytes; - bool track_uptodate; /* Who owns this io tree, should be one of IO_TREE_* */ u8 owner; @@ -89,33 +93,23 @@ struct extent_state { refcount_t refs; u32 state; - struct io_failure_record *failrec; - #ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; #endif }; -int __init extent_state_cache_init(void); -void __cold extent_state_cache_exit(void); - void extent_io_tree_init(struct btrfs_fs_info *fs_info, struct extent_io_tree *tree, unsigned int owner, void *private_data); void extent_io_tree_release(struct extent_io_tree *tree); -int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached); - -static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) -{ - return lock_extent_bits(tree, start, end, NULL); -} +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached); int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); -int __init extent_io_init(void); -void __cold extent_io_exit(void); +int __init extent_state_init_cachep(void); +void __cold extent_state_free_cachep(void); u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, @@ -126,72 +120,66 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, int filled, struct extent_state *cached_state); int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset); -int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int wake, int delete, - struct extent_state **cached); int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int wake, int delete, - struct extent_state **cached, gfp_t mask, - struct extent_changeset *changeset); + u32 bits, struct extent_state **cached, gfp_t mask, + struct extent_changeset *changeset); -static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) +static inline int clear_extent_bit(struct extent_io_tree *tree, u64 start, + u64 end, u32 bits, + struct extent_state **cached) { - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL); + return __clear_extent_bit(tree, start, end, bits, cached, + GFP_NOFS, NULL); } -static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) +static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached) { - return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, - GFP_NOFS, NULL); + return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached, + GFP_NOFS, NULL); } -static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree, - u64 start, u64 end, struct extent_state **cached) +static inline int unlock_extent_atomic(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) { - return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, - GFP_ATOMIC, NULL); + return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached, + GFP_ATOMIC, NULL); } static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits) { - int wake = 0; - - if (bits & EXTENT_LOCKED) - wake = 1; - - return clear_extent_bit(tree, start, end, bits, wake, 0, NULL); + return clear_extent_bit(tree, start, end, bits, NULL); } int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, unsigned exclusive_bits, u64 *failed_start, - struct extent_state **cached_state, gfp_t mask, - struct extent_changeset *changeset); -int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits); + u32 bits, struct extent_state **cached_state, gfp_t mask); + +static inline int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, + u64 end, u32 bits) +{ + return set_extent_bit(tree, start, end, bits, NULL, GFP_NOWAIT); +} static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits) { - return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, - NULL); + return set_extent_bit(tree, start, end, bits, NULL, GFP_NOFS); } static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state) { - return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, - cached_state, GFP_NOFS, NULL); + return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, + cached_state, GFP_NOFS, NULL); } static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, NULL, - mask, NULL); + return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, mask); } static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, @@ -199,7 +187,7 @@ static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, { return clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, cached); + EXTENT_DO_ACCOUNTING, cached); } int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, @@ -211,30 +199,29 @@ static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, struct extent_state **cached_state) { return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits, - 0, NULL, cached_state, GFP_NOFS, NULL); + EXTENT_DELALLOC | extra_bits, + cached_state, GFP_NOFS); } static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state) { return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, - 0, NULL, cached_state, GFP_NOFS, NULL); + EXTENT_DELALLOC | EXTENT_DEFRAG, + cached_state, GFP_NOFS); } static inline int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end) { - return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, NULL, - GFP_NOFS, NULL); + return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, GFP_NOFS); } static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, - cached_state, mask, NULL); + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, + cached_state, mask); } int find_first_extent_bit(struct extent_io_tree *tree, u64 start, @@ -244,24 +231,9 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits); int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits); -int extent_invalidatepage(struct extent_io_tree *tree, - struct page *page, unsigned long offset); bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state); - -/* This should be reworked in the future and put elsewhere. */ -struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start); -int set_state_failrec(struct extent_io_tree *tree, u64 start, - struct io_failure_record *failrec); -void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, - u64 end); -int free_io_failure(struct extent_io_tree *failure_tree, - struct extent_io_tree *io_tree, - struct io_failure_record *rec); -int clean_io_failure(struct btrfs_fs_info *fs_info, - struct extent_io_tree *failure_tree, - struct extent_io_tree *io_tree, u64 start, - struct page *page, u64 ino, unsigned int pg_offset); +void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits); #endif /* BTRFS_EXTENT_IO_TREE_H */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d89273c4b6b8..2801c991814f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -598,7 +598,7 @@ fail: static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - int refs_to_drop, int *last_ref) + int refs_to_drop) { struct btrfs_key key; struct btrfs_extent_data_ref *ref1 = NULL; @@ -631,7 +631,6 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, if (num_refs == 0) { ret = btrfs_del_item(trans, root, path); - *last_ref = 1; } else { if (key.type == BTRFS_EXTENT_DATA_REF_KEY) btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); @@ -896,7 +895,13 @@ again: err = -ENOENT; while (1) { if (ptr >= end) { - WARN_ON(ptr > end); + if (ptr > end) { + err = -EUCLEAN; + btrfs_print_leaf(path->nodes[0]); + btrfs_crit(fs_info, +"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu", + path->slots[0], root_objectid, owner, offset, parent); + } break; } iref = (struct btrfs_extent_inline_ref *)ptr; @@ -1072,8 +1077,7 @@ static noinline_for_stack void update_inline_extent_backref(struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_mod, - struct btrfs_delayed_extent_op *extent_op, - int *last_ref) + struct btrfs_delayed_extent_op *extent_op) { struct extent_buffer *leaf = path->nodes[0]; struct btrfs_extent_item *ei; @@ -1121,7 +1125,6 @@ void update_inline_extent_backref(struct btrfs_path *path, else btrfs_set_shared_data_ref_count(leaf, sref, refs); } else { - *last_ref = 1; size = btrfs_extent_inline_ref_size(type); item_size = btrfs_item_size(leaf, path->slots[0]); ptr = (unsigned long)iref; @@ -1166,8 +1169,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, } return -EUCLEAN; } - update_inline_extent_backref(path, iref, refs_to_add, - extent_op, NULL); + update_inline_extent_backref(path, iref, refs_to_add, extent_op); } else if (ret == -ENOENT) { setup_inline_extent_backref(trans->fs_info, path, iref, parent, root_objectid, owner, offset, @@ -1181,21 +1183,17 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, - int refs_to_drop, int is_data, int *last_ref) + int refs_to_drop, int is_data) { int ret = 0; BUG_ON(!is_data && refs_to_drop != 1); - if (iref) { - update_inline_extent_backref(path, iref, -refs_to_drop, NULL, - last_ref); - } else if (is_data) { - ret = remove_extent_data_ref(trans, root, path, refs_to_drop, - last_ref); - } else { - *last_ref = 1; + if (iref) + update_inline_extent_backref(path, iref, -refs_to_drop, NULL); + else if (is_data) + ret = remove_extent_data_ref(trans, root, path, refs_to_drop); + else ret = btrfs_del_item(trans, root, path); - } return ret; } @@ -1247,7 +1245,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, if (size) { ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, - GFP_NOFS, 0); + GFP_NOFS); if (!ret) *discarded_bytes += size; else if (ret != -EOPNOTSUPP) @@ -1264,14 +1262,14 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, if (bytes_left) { ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, - GFP_NOFS, 0); + GFP_NOFS); if (!ret) *discarded_bytes += bytes_left; } return ret; } -static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes) +static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes) { struct btrfs_device *dev = stripe->dev; struct btrfs_fs_info *fs_info = dev->fs_info; @@ -1299,7 +1297,7 @@ static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes) ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len, &discarded); discarded += src_disc; - } else if (blk_queue_discard(bdev_get_queue(stripe->dev->bdev))) { + } else if (bdev_max_discard_sectors(stripe->dev->bdev)) { ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded); } else { ret = 0; @@ -1318,76 +1316,60 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 discarded_bytes = 0; u64 end = bytenr + num_bytes; u64 cur = bytenr; - struct btrfs_io_context *bioc = NULL; /* - * Avoid races with device replace and make sure our bioc has devices - * associated to its stripes that don't go away while we are discarding. + * Avoid races with device replace and make sure the devices in the + * stripes don't go away while we are discarding. */ btrfs_bio_counter_inc_blocked(fs_info); while (cur < end) { - struct btrfs_io_stripe *stripe; + struct btrfs_discard_stripe *stripes; + unsigned int num_stripes; int i; num_bytes = end - cur; - /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur, - &num_bytes, &bioc, 0); - /* - * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or - * -EOPNOTSUPP. For any such error, @num_bytes is not updated, - * thus we can't continue anyway. - */ - if (ret < 0) - goto out; + stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes); + if (IS_ERR(stripes)) { + ret = PTR_ERR(stripes); + if (ret == -EOPNOTSUPP) + ret = 0; + break; + } - stripe = bioc->stripes; - for (i = 0; i < bioc->num_stripes; i++, stripe++) { + for (i = 0; i < num_stripes; i++) { + struct btrfs_discard_stripe *stripe = stripes + i; u64 bytes; - struct btrfs_device *device = stripe->dev; - if (!device->bdev) { + if (!stripe->dev->bdev) { ASSERT(btrfs_test_opt(fs_info, DEGRADED)); continue; } - if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, + &stripe->dev->dev_state)) continue; ret = do_discard_extent(stripe, &bytes); - if (!ret) { - discarded_bytes += bytes; - } else if (ret != -EOPNOTSUPP) { + if (ret) { /* - * Logic errors or -ENOMEM, or -EIO, but - * unlikely to happen. - * - * And since there are two loops, explicitly - * go to out to avoid confusion. + * Keep going if discard is not supported by the + * device. */ - btrfs_put_bioc(bioc); - goto out; + if (ret != -EOPNOTSUPP) + break; + ret = 0; + } else { + discarded_bytes += bytes; } - - /* - * Just in case we get back EOPNOTSUPP for some reason, - * just ignore the return value so we don't screw up - * people calling discard_extent. - */ - ret = 0; } - btrfs_put_bioc(bioc); + kfree(stripes); + if (ret) + break; cur += num_bytes; } -out: btrfs_bio_counter_dec(fs_info); - if (actual_bytes) *actual_bytes = discarded_bytes; - - - if (ret == -EOPNOTSUPP) - ret = 0; return ret; } @@ -1585,12 +1567,12 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, u32 item_size; int ret; int err = 0; - int metadata = !extent_op->is_data; + int metadata = 1; if (TRANS_ABORTED(trans)) return 0; - if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA)) metadata = 0; path = btrfs_alloc_path(); @@ -2188,7 +2170,7 @@ out: int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct extent_buffer *eb, u64 flags, - int level, int is_data) + int level) { struct btrfs_delayed_extent_op *extent_op; int ret; @@ -2200,7 +2182,6 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->flags_to_set = flags; extent_op->update_flags = true; extent_op->update_key = false; - extent_op->is_data = is_data ? true : false; extent_op->level = level; ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op); @@ -2239,6 +2220,12 @@ static noinline int check_delayed_ref(struct btrfs_root *root, } if (!mutex_trylock(&head->mutex)) { + if (path->nowait) { + spin_unlock(&delayed_refs->lock); + btrfs_put_transaction(cur_trans); + return -EAGAIN; + } + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); @@ -2365,15 +2352,10 @@ out: } int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, - u64 bytenr, bool strict) + u64 bytenr, bool strict, struct btrfs_path *path) { - struct btrfs_path *path; int ret; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - do { ret = check_committed_ref(root, path, objectid, offset, bytenr, strict); @@ -2384,7 +2366,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, } while (ret == -EAGAIN); out: - btrfs_free_path(path); + btrfs_release_path(path); if (btrfs_is_data_reloc_root(root)) WARN_ON(ret > 0); return ret; @@ -2505,24 +2487,21 @@ static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) return ret; } -static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) +static u64 first_logical_byte(struct btrfs_fs_info *fs_info) { - struct btrfs_block_group *cache; - u64 bytenr; + struct rb_node *leftmost; + u64 bytenr = 0; - spin_lock(&fs_info->block_group_cache_lock); - bytenr = fs_info->first_logical_byte; - spin_unlock(&fs_info->block_group_cache_lock); + read_lock(&fs_info->block_group_cache_lock); + /* Get the block group with the lowest logical start address. */ + leftmost = rb_first_cached(&fs_info->block_group_cache_tree); + if (leftmost) { + struct btrfs_block_group *bg; - if (bytenr < (u64)-1) - return bytenr; - - cache = btrfs_lookup_first_block_group(fs_info, search_start); - if (!cache) - return 0; - - bytenr = cache->start; - btrfs_put_block_group(cache); + bg = rb_entry(leftmost, struct btrfs_block_group, cache_node); + bytenr = bg->start; + } + read_unlock(&fs_info->block_group_cache_lock); return bytenr; } @@ -2578,17 +2557,10 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, return -EINVAL; /* - * pull in the free space cache (if any) so that our pin - * removes the free space from the cache. We have load_only set - * to one because the slow code to read in the free extents does check - * the pinned extents. + * Fully cache the free space first so that our pin removes the free space + * from the cache. */ - btrfs_cache_block_group(cache, 1); - /* - * Make sure we wait until the cache is completely built in case it is - * missing or is invalid and therefore needs to be rebuilt. - */ - ret = btrfs_wait_block_group_cache_done(cache); + ret = btrfs_cache_block_group(cache, true); if (ret) goto out; @@ -2611,12 +2583,7 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, if (!block_group) return -EINVAL; - btrfs_cache_block_group(block_group, 1); - /* - * Make sure we wait until the cache is completely built in case it is - * missing or is invalid and therefore needs to be rebuilt. - */ - ret = btrfs_wait_block_group_cache_done(block_group); + ret = btrfs_cache_block_group(block_group, true); if (ret) goto out; @@ -2725,13 +2692,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, len = cache->start + cache->length - start; len = min(len, end + 1 - start); - down_read(&fs_info->commit_root_sem); - if (start < cache->last_byte_to_unpin && return_free_space) { - u64 add_len = min(len, cache->last_byte_to_unpin - start); - - btrfs_add_free_space(cache, start, add_len); - } - up_read(&fs_info->commit_root_sem); + if (return_free_space) + btrfs_add_free_space(cache, start, len); start += len; total_unpinned += len; @@ -2766,12 +2728,11 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_unlock(&cache->lock); if (!readonly && return_free_space && global_rsv->space_info == space_info) { - u64 to_add = len; - spin_lock(&global_rsv->lock); if (!global_rsv->full) { - to_add = min(len, global_rsv->size - - global_rsv->reserved); + u64 to_add = min(len, global_rsv->size - + global_rsv->reserved); + global_rsv->reserved += to_add; btrfs_space_info_update_bytes_may_use(fs_info, space_info, to_add); @@ -2862,6 +2823,35 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) return 0; } +static int do_free_extent_accounting(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, bool is_data) +{ + int ret; + + if (is_data) { + struct btrfs_root *csum_root; + + csum_root = btrfs_csum_root(trans->fs_info, bytenr); + ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + } + + ret = add_to_free_space_tree(trans, bytenr, num_bytes); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); + if (ret) + btrfs_abort_transaction(trans, ret); + + return ret; +} + /* * Drop one or more refs of @node. * @@ -2943,7 +2933,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, u64 refs; u64 bytenr = node->bytenr; u64 num_bytes = node->num_bytes; - int last_ref = 0; bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); extent_root = btrfs_extent_root(info, bytenr); @@ -3010,8 +2999,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } /* Must be SHARED_* item, remove the backref first */ ret = remove_extent_backref(trans, extent_root, path, - NULL, refs_to_drop, is_data, - &last_ref); + NULL, refs_to_drop, is_data); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -3136,8 +3124,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, - iref, refs_to_drop, is_data, - &last_ref); + iref, refs_to_drop, is_data); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -3182,7 +3169,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } - last_ref = 1; ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); if (ret) { @@ -3191,28 +3177,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - if (is_data) { - struct btrfs_root *csum_root; - csum_root = btrfs_csum_root(info, bytenr); - ret = btrfs_del_csums(trans, csum_root, bytenr, - num_bytes); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } - } - - ret = add_to_free_space_tree(trans, bytenr, num_bytes); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } - - ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } + ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data); } btrfs_release_path(path); @@ -3330,21 +3295,22 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, } /* - * If this is a leaf and there are tree mod log users, we may - * have recorded mod log operations that point to this leaf. - * So we must make sure no one reuses this leaf's extent before - * mod log operations are applied to a node, otherwise after - * rewinding a node using the mod log operations we get an - * inconsistent btree, as the leaf's extent may now be used as - * a node or leaf for another different btree. + * If there are tree mod log users we may have recorded mod log + * operations for this node. If we re-allocate this node we + * could replay operations on this node that happened when it + * existed in a completely different root. For example if it + * was part of root A, then was reallocated to root B, and we + * are doing a btrfs_old_search_slot(root b), we could replay + * operations that happened when the block was part of root A, + * giving us an inconsistent view of the btree. + * * We are safe from races here because at this point no other * node or root points to this extent buffer, so if after this - * check a new tree mod log user joins, it will not be able to - * find a node pointing to this leaf and record operations that - * point to this leaf. + * check a new tree mod log user joins we will not have an + * existing log of operations on this node that we have to + * contend with. */ - if (btrfs_header_level(buf) == 0 && - test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) + if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) must_pin = true; if (must_pin || btrfs_is_zoned(fs_info)) { @@ -3808,8 +3774,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, /* Check RO and no space case before trying to activate it */ spin_lock(&block_group->lock); - if (block_group->ro || - block_group->alloc_offset == block_group->zone_capacity) { + if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) { ret = 1; /* * May need to clear fs_info->{treelog,data_reloc}_bg. @@ -3841,7 +3806,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, block_group->start == fs_info->data_reloc_bg || fs_info->data_reloc_bg == 0); - if (block_group->ro) { + if (block_group->ro || + test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { ret = 1; goto out; } @@ -3903,8 +3869,24 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, out: if (ret && ffe_ctl->for_treelog) fs_info->treelog_bg = 0; - if (ret && ffe_ctl->for_data_reloc) + if (ret && ffe_ctl->for_data_reloc && + fs_info->data_reloc_bg == block_group->start) { + /* + * Do not allow further allocations from this block group. + * Compared to increasing the ->ro, setting the + * ->zoned_data_reloc_ongoing flag still allows nocow + * writers to come in. See btrfs_inc_nocow_writers(). + * + * We need to disable an allocation to avoid an allocation of + * regular (non-relocation data) extent. With mix of relocation + * extents and regular extents, we can dispatch WRITE commands + * (for relocation extents) and ZONE APPEND commands (for + * regular extents) at the same time to the same zone, which + * easily break the write pointer. + */ + set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags); fs_info->data_reloc_bg = 0; + } spin_unlock(&fs_info->relocation_bg_lock); spin_unlock(&fs_info->treelog_bg_lock); spin_unlock(&block_group->lock); @@ -3974,23 +3956,63 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl, } } -static bool can_allocate_chunk(struct btrfs_fs_info *fs_info, - struct find_free_extent_ctl *ffe_ctl) +static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl) +{ + /* If we can activate new zone, just allocate a chunk and use it */ + if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) + return 0; + + /* + * We already reached the max active zones. Try to finish one block + * group to make a room for a new block group. This is only possible + * for a data block group because btrfs_zone_finish() may need to wait + * for a running transaction which can cause a deadlock for metadata + * allocation. + */ + if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { + int ret = btrfs_zone_finish_one_bg(fs_info); + + if (ret == 1) + return 0; + else if (ret < 0) + return ret; + } + + /* + * If we have enough free space left in an already active block group + * and we can't activate any other zone now, do not allow allocating a + * new chunk and let find_free_extent() retry with a smaller size. + */ + if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size) + return -ENOSPC; + + /* + * Even min_alloc_size is not left in any block groups. Since we cannot + * activate a new block group, allocating it may not help. Let's tell a + * caller to try again and hope it progress something by writing some + * parts of the region. That is only possible for data block groups, + * where a part of the region can be written. + */ + if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) + return -EAGAIN; + + /* + * We cannot activate a new block group and no enough space left in any + * block groups. So, allocating a new block group may not help. But, + * there is nothing to do anyway, so let's go with it. + */ + return 0; +} + +static int can_allocate_chunk(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl) { switch (ffe_ctl->policy) { case BTRFS_EXTENT_ALLOC_CLUSTERED: - return true; + return 0; case BTRFS_EXTENT_ALLOC_ZONED: - /* - * If we have enough free space left in an already - * active block group and we can't activate any other - * zone now, do not allow allocating a new chunk and - * let find_free_extent() retry with a smaller size. - */ - if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size && - !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) - return false; - return true; + return can_allocate_chunk_zoned(fs_info, ffe_ctl); default: BUG(); } @@ -4072,8 +4094,9 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, int exist = 0; /*Check if allocation policy allows to create a new chunk */ - if (!can_allocate_chunk(fs_info, ffe_ctl)) - return -ENOSPC; + ret = can_allocate_chunk(fs_info, ffe_ctl); + if (ret) + return ret; trans = current->journal_info; if (trans) @@ -4087,7 +4110,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, } ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, - CHUNK_ALLOC_FORCE); + CHUNK_ALLOC_FORCE_FOR_EXTENT); /* Do not bail out on ENOSPC since we can do more. */ if (ret == -ENOSPC) @@ -4277,7 +4300,7 @@ static noinline int find_free_extent(struct btrfs_root *root, return ret; ffe_ctl->search_start = max(ffe_ctl->search_start, - first_logical_byte(fs_info, 0)); + first_logical_byte(fs_info)); ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte); if (ffe_ctl->search_start == ffe_ctl->hint_byte) { block_group = btrfs_lookup_block_group(fs_info, @@ -4367,7 +4390,7 @@ have_block_group: ffe_ctl->cached = btrfs_block_group_done(block_group); if (unlikely(!ffe_ctl->cached)) { ffe_ctl->have_caching_bg = true; - ret = btrfs_cache_block_group(block_group, 0); + ret = btrfs_cache_block_group(block_group, false); /* * If we get ENOMEM here or something else we want to @@ -4605,6 +4628,28 @@ int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, return ret; } +static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr, + u64 num_bytes) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + int ret; + + ret = remove_from_free_space_tree(trans, bytenr, num_bytes); + if (ret) + return ret; + + ret = btrfs_update_block_group(trans, bytenr, num_bytes, true); + if (ret) { + ASSERT(!ret); + btrfs_err(fs_info, "update block group failed for %llu %llu", + bytenr, num_bytes); + return ret; + } + + trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes); + return 0; +} + static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, u64 flags, u64 owner, u64 offset, @@ -4665,18 +4710,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset); - if (ret) - return ret; - - ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, true); - if (ret) { /* -ENOENT, logic error */ - btrfs_err(fs_info, "update block group failed for %llu %llu", - ins->objectid, ins->offset); - BUG(); - } - trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset); - return ret; + return alloc_reserved_extent(trans, ins->objectid, ins->offset); } static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, @@ -4694,7 +4728,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_delayed_tree_ref *ref; u32 size = sizeof(*extent_item) + sizeof(*iref); - u64 num_bytes; u64 flags = extent_op->flags_to_set; bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); @@ -4704,12 +4737,10 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, if (skinny_metadata) { extent_key.offset = ref->level; extent_key.type = BTRFS_METADATA_ITEM_KEY; - num_bytes = fs_info->nodesize; } else { extent_key.offset = node->num_bytes; extent_key.type = BTRFS_EXTENT_ITEM_KEY; size += sizeof(*block_info); - num_bytes = node->num_bytes; } path = btrfs_alloc_path(); @@ -4754,22 +4785,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - ret = remove_from_free_space_tree(trans, extent_key.objectid, - num_bytes); - if (ret) - return ret; - - ret = btrfs_update_block_group(trans, extent_key.objectid, - fs_info->nodesize, true); - if (ret) { /* -ENOENT, logic error */ - btrfs_err(fs_info, "update block group failed for %llu %llu", - extent_key.objectid, extent_key.offset); - BUG(); - } - - trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid, - fs_info->nodesize); - return ret; + return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); } int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, @@ -4842,6 +4858,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *buf; + u64 lockdep_owner = owner; buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level); if (IS_ERR(buf)) @@ -4861,11 +4878,29 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, } /* + * The reloc trees are just snapshots, so we need them to appear to be + * just like any other fs tree WRT lockdep. + * + * The exception however is in replace_path() in relocation, where we + * hold the lock on the original fs root and then search for the reloc + * root. At that point we need to make sure any reloc root buffers are + * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make + * lockdep happy. + */ + if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID && + !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state)) + lockdep_owner = BTRFS_FS_TREE_OBJECTID; + + /* btrfs_clean_tree_block() accesses generation field. */ + btrfs_set_header_generation(buf, trans->transid); + + /* * This needs to stay, because we could allocate a freed block from an * old tree into a new tree, so we need to make sure this new block is * set to the appropriate level and owner. */ - btrfs_set_buffer_lockdep_class(owner, buf, level); + btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level); + __btrfs_tree_lock(buf, nest); btrfs_clean_tree_block(buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); @@ -4971,7 +5006,6 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, extent_op->flags_to_set = flags; extent_op->update_key = skinny_metadata ? false : true; extent_op->update_flags = true; - extent_op->is_data = false; extent_op->level = level; btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, @@ -5156,7 +5190,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ ret = btrfs_set_disk_extent_flags(trans, eb, flag, - btrfs_header_level(eb), 0); + btrfs_header_level(eb)); BUG_ON(ret); /* -ENOMEM */ wc->flags[level] |= flag; } @@ -5611,6 +5645,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, */ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) { + const bool is_reloc_root = (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID); struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct btrfs_trans_handle *trans; @@ -5622,6 +5658,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) int ret; int level; bool root_dropped = false; + bool unfinished_drop = false; btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid); @@ -5664,6 +5701,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) * already dropped. */ set_bit(BTRFS_ROOT_DELETING, &root->state); + unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state); + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_header_level(root->node); path->nodes[level] = btrfs_lock_root_node(root); @@ -5767,6 +5806,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) goto out_end_trans; } + if (!is_reloc_root) + btrfs_set_last_root_drop_gen(fs_info, trans->transid); + btrfs_end_transaction_throttle(trans); if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) { btrfs_debug(fs_info, @@ -5801,7 +5843,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) goto out_end_trans; } - if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + if (!is_reloc_root) { ret = btrfs_find_root(tree_root, &root->root_key, path, NULL, NULL); if (ret < 0) { @@ -5833,12 +5875,22 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) btrfs_put_root(root); root_dropped = true; out_end_trans: + if (!is_reloc_root) + btrfs_set_last_root_drop_gen(fs_info, trans->transid); + btrfs_end_transaction_throttle(trans); out_free: kfree(wc); btrfs_free_path(path); out: /* + * We were an unfinished drop root, check to see if there are any + * pending, and if not clear and wake up any waiters. + */ + if (!err && unfinished_drop) + btrfs_maybe_wake_unfinished_drop(fs_info); + + /* * So if we need to stop dropping the snapshot for whatever reason we * need to make sure to add it back to the dead root list so that we * keep trying to do the work later. This also cleans up roots if we @@ -5983,13 +6035,13 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, */ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) { - u64 start = SZ_1M, len = 0, end = 0; + u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0; int ret; *trimmed = 0; /* Discard not supported = nothing to do. */ - if (!blk_queue_discard(bdev_get_queue(device->bdev))) + if (!bdev_max_discard_sectors(device->bdev)) return 0; /* Not writable = nothing to do. */ @@ -6027,8 +6079,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) break; } - /* Ensure we skip the reserved area in the first 1M */ - start = max_t(u64, start, SZ_1M); + /* Ensure we skip the reserved space on each device. */ + start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED); /* * If find_first_clear_extent_bit find a range that spans the @@ -6119,13 +6171,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) if (end - start >= range->minlen) { if (!btrfs_block_group_done(cache)) { - ret = btrfs_cache_block_group(cache, 0); - if (ret) { - bg_failed++; - bg_ret = ret; - continue; - } - ret = btrfs_wait_block_group_cache_done(cache); + ret = btrfs_cache_block_group(cache, true); if (ret) { bg_failed++; bg_ret = ret; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 409bad3928db..4dcf22e051ff 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -6,6 +6,7 @@ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/page-flags.h> +#include <linux/sched/mm.h> #include <linux/spinlock.h> #include <linux/blkdev.h> #include <linux/swap.h> @@ -28,39 +29,29 @@ #include "subpage.h" #include "zoned.h" #include "block-group.h" +#include "compression.h" -static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; -static struct bio_set btrfs_bioset; - -static inline bool extent_state_in_tree(const struct extent_state *state) -{ - return !RB_EMPTY_NODE(&state->rb_node); -} #ifdef CONFIG_BTRFS_DEBUG -static LIST_HEAD(states); -static DEFINE_SPINLOCK(leak_lock); - -static inline void btrfs_leak_debug_add(spinlock_t *lock, - struct list_head *new, - struct list_head *head) +static inline void btrfs_leak_debug_add_eb(struct extent_buffer *eb) { + struct btrfs_fs_info *fs_info = eb->fs_info; unsigned long flags; - spin_lock_irqsave(lock, flags); - list_add(new, head); - spin_unlock_irqrestore(lock, flags); + spin_lock_irqsave(&fs_info->eb_leak_lock, flags); + list_add(&eb->leak_list, &fs_info->allocated_ebs); + spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); } -static inline void btrfs_leak_debug_del(spinlock_t *lock, - struct list_head *entry) +static inline void btrfs_leak_debug_del_eb(struct extent_buffer *eb) { + struct btrfs_fs_info *fs_info = eb->fs_info; unsigned long flags; - spin_lock_irqsave(lock, flags); - list_del(entry); - spin_unlock_irqrestore(lock, flags); + spin_lock_irqsave(&fs_info->eb_leak_lock, flags); + list_del(&eb->leak_list); + spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); } void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) @@ -75,6 +66,7 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) if (!fs_info->allocated_ebs.next) return; + WARN_ON(!list_empty(&fs_info->allocated_ebs)); spin_lock_irqsave(&fs_info->eb_leak_lock, flags); while (!list_empty(&fs_info->allocated_ebs)) { eb = list_first_entry(&fs_info->allocated_ebs, @@ -88,51 +80,22 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) } spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); } - -static inline void btrfs_extent_state_leak_debug_check(void) -{ - struct extent_state *state; - - while (!list_empty(&states)) { - state = list_entry(states.next, struct extent_state, leak_list); - pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", - state->start, state->end, state->state, - extent_state_in_tree(state), - refcount_read(&state->refs)); - list_del(&state->leak_list); - kmem_cache_free(extent_state_cache, state); - } -} - -#define btrfs_debug_check_extent_io_range(tree, start, end) \ - __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end)) -static inline void __btrfs_debug_check_extent_io_range(const char *caller, - struct extent_io_tree *tree, u64 start, u64 end) -{ - struct inode *inode = tree->private_data; - u64 isize; - - if (!inode || !is_data_inode(inode)) - return; - - isize = i_size_read(inode); - if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { - btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, - "%s: ino %llu isize %llu odd range [%llu,%llu]", - caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); - } -} #else -#define btrfs_leak_debug_add(lock, new, head) do {} while (0) -#define btrfs_leak_debug_del(lock, entry) do {} while (0) -#define btrfs_extent_state_leak_debug_check() do {} while (0) -#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) +#define btrfs_leak_debug_add_eb(eb) do {} while (0) +#define btrfs_leak_debug_del_eb(eb) do {} while (0) #endif -struct tree_entry { - u64 start; - u64 end; - struct rb_node rb_node; +/* + * Structure to record info about the bio being assembled, and other info like + * how many bytes are there before stripe/ordered extent boundary. + */ +struct btrfs_bio_ctrl { + struct bio *bio; + int mirror_num; + enum btrfs_compression_type compress_type; + u32 len_to_stripe_boundary; + u32 len_to_oe_boundary; + btrfs_bio_end_io_t end_io_func; }; struct extent_page_data { @@ -146,92 +109,59 @@ struct extent_page_data { unsigned int sync_io:1; }; -static int add_extent_changeset(struct extent_state *state, u32 bits, - struct extent_changeset *changeset, - int set) +static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { - int ret; - - if (!changeset) - return 0; - if (set && (state->state & bits) == bits) - return 0; - if (!set && (state->state & bits) == 0) - return 0; - changeset->bytes_changed += state->end - state->start + 1; - ret = ulist_add(&changeset->range_changed, state->start, state->end, - GFP_ATOMIC); - return ret; -} + struct bio *bio; + struct bio_vec *bv; + struct inode *inode; + int mirror_num; -int __must_check submit_one_bio(struct bio *bio, int mirror_num, - unsigned long bio_flags) -{ - blk_status_t ret = 0; - struct extent_io_tree *tree = bio->bi_private; + if (!bio_ctrl->bio) + return; - bio->bi_private = NULL; + bio = bio_ctrl->bio; + bv = bio_first_bvec_all(bio); + inode = bv->bv_page->mapping->host; + mirror_num = bio_ctrl->mirror_num; /* Caller should ensure the bio has at least some range added */ ASSERT(bio->bi_iter.bi_size); - if (is_data_inode(tree->private_data)) - ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num, - bio_flags); - else - ret = btrfs_submit_metadata_bio(tree->private_data, bio, - mirror_num, bio_flags); - return blk_status_to_errno(ret); -} + btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset; -/* Cleanup unsubmitted bios */ -static void end_write_bio(struct extent_page_data *epd, int ret) -{ - struct bio *bio = epd->bio_ctrl.bio; + if (!is_data_inode(inode)) + btrfs_submit_metadata_bio(inode, bio, mirror_num); + else if (btrfs_op(bio) == BTRFS_MAP_WRITE) + btrfs_submit_data_write_bio(inode, bio, mirror_num); + else + btrfs_submit_data_read_bio(inode, bio, mirror_num, + bio_ctrl->compress_type); - if (bio) { - bio->bi_status = errno_to_blk_status(ret); - bio_endio(bio); - epd->bio_ctrl.bio = NULL; - } + /* The bio is owned by the end_io handler now */ + bio_ctrl->bio = NULL; } /* - * Submit bio from extent page data via submit_one_bio - * - * Return 0 if everything is OK. - * Return <0 for error. + * Submit or fail the current bio in an extent_page_data structure. */ -static int __must_check flush_write_bio(struct extent_page_data *epd) +static void submit_write_bio(struct extent_page_data *epd, int ret) { - int ret = 0; struct bio *bio = epd->bio_ctrl.bio; - if (bio) { - ret = submit_one_bio(bio, 0, 0); - /* - * Clean up of epd->bio is handled by its endio function. - * And endio is either triggered by successful bio execution - * or the error handler of submit bio hook. - * So at this point, no matter what happened, we don't need - * to clean up epd->bio. - */ + if (!bio) + return; + + if (ret) { + ASSERT(ret < 0); + btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); + /* The bio is owned by the end_io handler now */ epd->bio_ctrl.bio = NULL; + } else { + submit_one_bio(&epd->bio_ctrl); } - return ret; -} - -int __init extent_state_cache_init(void) -{ - extent_state_cache = kmem_cache_create("btrfs_extent_state", - sizeof(struct extent_state), 0, - SLAB_MEM_SPREAD, NULL); - if (!extent_state_cache) - return -ENOMEM; - return 0; } -int __init extent_io_init(void) +int __init extent_buffer_init_cachep(void) { extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", sizeof(struct extent_buffer), 0, @@ -239,32 +169,10 @@ int __init extent_io_init(void) if (!extent_buffer_cache) return -ENOMEM; - if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, - offsetof(struct btrfs_bio, bio), - BIOSET_NEED_BVECS)) - goto free_buffer_cache; - - if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE)) - goto free_bioset; - return 0; - -free_bioset: - bioset_exit(&btrfs_bioset); - -free_buffer_cache: - kmem_cache_destroy(extent_buffer_cache); - extent_buffer_cache = NULL; - return -ENOMEM; } -void __cold extent_state_cache_exit(void) -{ - btrfs_extent_state_leak_debug_check(); - kmem_cache_destroy(extent_state_cache); -} - -void __cold extent_io_exit(void) +void __cold extent_buffer_free_cachep(void) { /* * Make sure all delayed rcu free are flushed before we @@ -272,1222 +180,6 @@ void __cold extent_io_exit(void) */ rcu_barrier(); kmem_cache_destroy(extent_buffer_cache); - bioset_exit(&btrfs_bioset); -} - -/* - * For the file_extent_tree, we want to hold the inode lock when we lookup and - * update the disk_i_size, but lockdep will complain because our io_tree we hold - * the tree lock and get the inode lock when setting delalloc. These two things - * are unrelated, so make a class for the file_extent_tree so we don't get the - * two locking patterns mixed up. - */ -static struct lock_class_key file_extent_tree_class; - -void extent_io_tree_init(struct btrfs_fs_info *fs_info, - struct extent_io_tree *tree, unsigned int owner, - void *private_data) -{ - tree->fs_info = fs_info; - tree->state = RB_ROOT; - tree->dirty_bytes = 0; - spin_lock_init(&tree->lock); - tree->private_data = private_data; - tree->owner = owner; - if (owner == IO_TREE_INODE_FILE_EXTENT) - lockdep_set_class(&tree->lock, &file_extent_tree_class); -} - -void extent_io_tree_release(struct extent_io_tree *tree) -{ - spin_lock(&tree->lock); - /* - * Do a single barrier for the waitqueue_active check here, the state - * of the waitqueue should not change once extent_io_tree_release is - * called. - */ - smp_mb(); - while (!RB_EMPTY_ROOT(&tree->state)) { - struct rb_node *node; - struct extent_state *state; - - node = rb_first(&tree->state); - state = rb_entry(node, struct extent_state, rb_node); - rb_erase(&state->rb_node, &tree->state); - RB_CLEAR_NODE(&state->rb_node); - /* - * btree io trees aren't supposed to have tasks waiting for - * changes in the flags of extent states ever. - */ - ASSERT(!waitqueue_active(&state->wq)); - free_extent_state(state); - - cond_resched_lock(&tree->lock); - } - spin_unlock(&tree->lock); -} - -static struct extent_state *alloc_extent_state(gfp_t mask) -{ - struct extent_state *state; - - /* - * The given mask might be not appropriate for the slab allocator, - * drop the unsupported bits - */ - mask &= ~(__GFP_DMA32|__GFP_HIGHMEM); - state = kmem_cache_alloc(extent_state_cache, mask); - if (!state) - return state; - state->state = 0; - state->failrec = NULL; - RB_CLEAR_NODE(&state->rb_node); - btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states); - refcount_set(&state->refs, 1); - init_waitqueue_head(&state->wq); - trace_alloc_extent_state(state, mask, _RET_IP_); - return state; -} - -void free_extent_state(struct extent_state *state) -{ - if (!state) - return; - if (refcount_dec_and_test(&state->refs)) { - WARN_ON(extent_state_in_tree(state)); - btrfs_leak_debug_del(&leak_lock, &state->leak_list); - trace_free_extent_state(state, _RET_IP_); - kmem_cache_free(extent_state_cache, state); - } -} - -static struct rb_node *tree_insert(struct rb_root *root, - struct rb_node *search_start, - u64 offset, - struct rb_node *node, - struct rb_node ***p_in, - struct rb_node **parent_in) -{ - struct rb_node **p; - struct rb_node *parent = NULL; - struct tree_entry *entry; - - if (p_in && parent_in) { - p = *p_in; - parent = *parent_in; - goto do_insert; - } - - p = search_start ? &search_start : &root->rb_node; - while (*p) { - parent = *p; - entry = rb_entry(parent, struct tree_entry, rb_node); - - if (offset < entry->start) - p = &(*p)->rb_left; - else if (offset > entry->end) - p = &(*p)->rb_right; - else - return parent; - } - -do_insert: - rb_link_node(node, parent, p); - rb_insert_color(node, root); - return NULL; -} - -/** - * Search @tree for an entry that contains @offset. Such entry would have - * entry->start <= offset && entry->end >= offset. - * - * @tree: the tree to search - * @offset: offset that should fall within an entry in @tree - * @next_ret: pointer to the first entry whose range ends after @offset - * @prev_ret: pointer to the first entry whose range begins before @offset - * @p_ret: pointer where new node should be anchored (used when inserting an - * entry in the tree) - * @parent_ret: points to entry which would have been the parent of the entry, - * containing @offset - * - * This function returns a pointer to the entry that contains @offset byte - * address. If no such entry exists, then NULL is returned and the other - * pointer arguments to the function are filled, otherwise the found entry is - * returned and other pointers are left untouched. - */ -static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, - struct rb_node **next_ret, - struct rb_node **prev_ret, - struct rb_node ***p_ret, - struct rb_node **parent_ret) -{ - struct rb_root *root = &tree->state; - struct rb_node **n = &root->rb_node; - struct rb_node *prev = NULL; - struct rb_node *orig_prev = NULL; - struct tree_entry *entry; - struct tree_entry *prev_entry = NULL; - - while (*n) { - prev = *n; - entry = rb_entry(prev, struct tree_entry, rb_node); - prev_entry = entry; - - if (offset < entry->start) - n = &(*n)->rb_left; - else if (offset > entry->end) - n = &(*n)->rb_right; - else - return *n; - } - - if (p_ret) - *p_ret = n; - if (parent_ret) - *parent_ret = prev; - - if (next_ret) { - orig_prev = prev; - while (prev && offset > prev_entry->end) { - prev = rb_next(prev); - prev_entry = rb_entry(prev, struct tree_entry, rb_node); - } - *next_ret = prev; - prev = orig_prev; - } - - if (prev_ret) { - prev_entry = rb_entry(prev, struct tree_entry, rb_node); - while (prev && offset < prev_entry->start) { - prev = rb_prev(prev); - prev_entry = rb_entry(prev, struct tree_entry, rb_node); - } - *prev_ret = prev; - } - return NULL; -} - -static inline struct rb_node * -tree_search_for_insert(struct extent_io_tree *tree, - u64 offset, - struct rb_node ***p_ret, - struct rb_node **parent_ret) -{ - struct rb_node *next= NULL; - struct rb_node *ret; - - ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret); - if (!ret) - return next; - return ret; -} - -static inline struct rb_node *tree_search(struct extent_io_tree *tree, - u64 offset) -{ - return tree_search_for_insert(tree, offset, NULL, NULL); -} - -/* - * utility function to look for merge candidates inside a given range. - * Any extents with matching state are merged together into a single - * extent in the tree. Extents with EXTENT_IO in their state field - * are not merged because the end_io handlers need to be able to do - * operations on them without sleeping (or doing allocations/splits). - * - * This should be called with the tree lock held. - */ -static void merge_state(struct extent_io_tree *tree, - struct extent_state *state) -{ - struct extent_state *other; - struct rb_node *other_node; - - if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY)) - return; - - other_node = rb_prev(&state->rb_node); - if (other_node) { - other = rb_entry(other_node, struct extent_state, rb_node); - if (other->end == state->start - 1 && - other->state == state->state) { - if (tree->private_data && - is_data_inode(tree->private_data)) - btrfs_merge_delalloc_extent(tree->private_data, - state, other); - state->start = other->start; - rb_erase(&other->rb_node, &tree->state); - RB_CLEAR_NODE(&other->rb_node); - free_extent_state(other); - } - } - other_node = rb_next(&state->rb_node); - if (other_node) { - other = rb_entry(other_node, struct extent_state, rb_node); - if (other->start == state->end + 1 && - other->state == state->state) { - if (tree->private_data && - is_data_inode(tree->private_data)) - btrfs_merge_delalloc_extent(tree->private_data, - state, other); - state->end = other->end; - rb_erase(&other->rb_node, &tree->state); - RB_CLEAR_NODE(&other->rb_node); - free_extent_state(other); - } - } -} - -static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, u32 *bits, - struct extent_changeset *changeset); - -/* - * insert an extent_state struct into the tree. 'bits' are set on the - * struct before it is inserted. - * - * This may return -EEXIST if the extent is already there, in which case the - * state struct is freed. - * - * The tree lock is not taken internally. This is a utility function and - * probably isn't what you want to call (see set/clear_extent_bit). - */ -static int insert_state(struct extent_io_tree *tree, - struct extent_state *state, u64 start, u64 end, - struct rb_node ***p, - struct rb_node **parent, - u32 *bits, struct extent_changeset *changeset) -{ - struct rb_node *node; - - if (end < start) { - btrfs_err(tree->fs_info, - "insert state: end < start %llu %llu", end, start); - WARN_ON(1); - } - state->start = start; - state->end = end; - - set_state_bits(tree, state, bits, changeset); - - node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent); - if (node) { - struct extent_state *found; - found = rb_entry(node, struct extent_state, rb_node); - btrfs_err(tree->fs_info, - "found node %llu %llu on insert of %llu %llu", - found->start, found->end, start, end); - return -EEXIST; - } - merge_state(tree, state); - return 0; -} - -/* - * split a given extent state struct in two, inserting the preallocated - * struct 'prealloc' as the newly created second half. 'split' indicates an - * offset inside 'orig' where it should be split. - * - * Before calling, - * the tree has 'orig' at [orig->start, orig->end]. After calling, there - * are two extent state structs in the tree: - * prealloc: [orig->start, split - 1] - * orig: [ split, orig->end ] - * - * The tree locks are not taken by this function. They need to be held - * by the caller. - */ -static int split_state(struct extent_io_tree *tree, struct extent_state *orig, - struct extent_state *prealloc, u64 split) -{ - struct rb_node *node; - - if (tree->private_data && is_data_inode(tree->private_data)) - btrfs_split_delalloc_extent(tree->private_data, orig, split); - - prealloc->start = orig->start; - prealloc->end = split - 1; - prealloc->state = orig->state; - orig->start = split; - - node = tree_insert(&tree->state, &orig->rb_node, prealloc->end, - &prealloc->rb_node, NULL, NULL); - if (node) { - free_extent_state(prealloc); - return -EEXIST; - } - return 0; -} - -static struct extent_state *next_state(struct extent_state *state) -{ - struct rb_node *next = rb_next(&state->rb_node); - if (next) - return rb_entry(next, struct extent_state, rb_node); - else - return NULL; -} - -/* - * utility function to clear some bits in an extent state struct. - * it will optionally wake up anyone waiting on this state (wake == 1). - * - * If no bits are set on the state struct after clearing things, the - * struct is freed and removed from the tree - */ -static struct extent_state *clear_state_bit(struct extent_io_tree *tree, - struct extent_state *state, - u32 *bits, int wake, - struct extent_changeset *changeset) -{ - struct extent_state *next; - u32 bits_to_clear = *bits & ~EXTENT_CTLBITS; - int ret; - - if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { - u64 range = state->end - state->start + 1; - WARN_ON(range > tree->dirty_bytes); - tree->dirty_bytes -= range; - } - - if (tree->private_data && is_data_inode(tree->private_data)) - btrfs_clear_delalloc_extent(tree->private_data, state, bits); - - ret = add_extent_changeset(state, bits_to_clear, changeset, 0); - BUG_ON(ret < 0); - state->state &= ~bits_to_clear; - if (wake) - wake_up(&state->wq); - if (state->state == 0) { - next = next_state(state); - if (extent_state_in_tree(state)) { - rb_erase(&state->rb_node, &tree->state); - RB_CLEAR_NODE(&state->rb_node); - free_extent_state(state); - } else { - WARN_ON(1); - } - } else { - merge_state(tree, state); - next = next_state(state); - } - return next; -} - -static struct extent_state * -alloc_extent_state_atomic(struct extent_state *prealloc) -{ - if (!prealloc) - prealloc = alloc_extent_state(GFP_ATOMIC); - - return prealloc; -} - -static void extent_io_tree_panic(struct extent_io_tree *tree, int err) -{ - btrfs_panic(tree->fs_info, err, - "locking error: extent tree was modified by another thread while locked"); -} - -/* - * clear some bits on a range in the tree. This may require splitting - * or inserting elements in the tree, so the gfp mask is used to - * indicate which allocations or sleeping are allowed. - * - * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove - * the given range from the tree regardless of state (ie for truncate). - * - * the range [start, end] is inclusive. - * - * This takes the tree lock, and returns 0 on success and < 0 on error. - */ -int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int wake, int delete, - struct extent_state **cached_state, - gfp_t mask, struct extent_changeset *changeset) -{ - struct extent_state *state; - struct extent_state *cached; - struct extent_state *prealloc = NULL; - struct rb_node *node; - u64 last_end; - int err; - int clear = 0; - - btrfs_debug_check_extent_io_range(tree, start, end); - trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits); - - if (bits & EXTENT_DELALLOC) - bits |= EXTENT_NORESERVE; - - if (delete) - bits |= ~EXTENT_CTLBITS; - - if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)) - clear = 1; -again: - if (!prealloc && gfpflags_allow_blocking(mask)) { - /* - * Don't care for allocation failure here because we might end - * up not needing the pre-allocated extent state at all, which - * is the case if we only have in the tree extent states that - * cover our input range and don't cover too any other range. - * If we end up needing a new extent state we allocate it later. - */ - prealloc = alloc_extent_state(mask); - } - - spin_lock(&tree->lock); - if (cached_state) { - cached = *cached_state; - - if (clear) { - *cached_state = NULL; - cached_state = NULL; - } - - if (cached && extent_state_in_tree(cached) && - cached->start <= start && cached->end > start) { - if (clear) - refcount_dec(&cached->refs); - state = cached; - goto hit_next; - } - if (clear) - free_extent_state(cached); - } - /* - * this search will find the extents that end after - * our range starts - */ - node = tree_search(tree, start); - if (!node) - goto out; - state = rb_entry(node, struct extent_state, rb_node); -hit_next: - if (state->start > end) - goto out; - WARN_ON(state->end < start); - last_end = state->end; - - /* the state doesn't have the wanted bits, go ahead */ - if (!(state->state & bits)) { - state = next_state(state); - goto next; - } - - /* - * | ---- desired range ---- | - * | state | or - * | ------------- state -------------- | - * - * We need to split the extent we found, and may flip - * bits on second half. - * - * If the extent we found extends past our range, we - * just split and search again. It'll get split again - * the next time though. - * - * If the extent we found is inside our range, we clear - * the desired bit on it. - */ - - if (state->start < start) { - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, err); - - prealloc = NULL; - if (err) - goto out; - if (state->end <= end) { - state = clear_state_bit(tree, state, &bits, wake, - changeset); - goto next; - } - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | - * We need to split the extent, and clear the bit - * on the first half - */ - if (state->start <= end && state->end > end) { - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, err); - - if (wake) - wake_up(&state->wq); - - clear_state_bit(tree, prealloc, &bits, wake, changeset); - - prealloc = NULL; - goto out; - } - - state = clear_state_bit(tree, state, &bits, wake, changeset); -next: - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - if (start <= end && state && !need_resched()) - goto hit_next; - -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - if (gfpflags_allow_blocking(mask)) - cond_resched(); - goto again; - -out: - spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); - - return 0; - -} - -static void wait_on_state(struct extent_io_tree *tree, - struct extent_state *state) - __releases(tree->lock) - __acquires(tree->lock) -{ - DEFINE_WAIT(wait); - prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&tree->lock); - schedule(); - spin_lock(&tree->lock); - finish_wait(&state->wq, &wait); -} - -/* - * waits for one or more bits to clear on a range in the state tree. - * The range [start, end] is inclusive. - * The tree lock is taken by this function - */ -static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits) -{ - struct extent_state *state; - struct rb_node *node; - - btrfs_debug_check_extent_io_range(tree, start, end); - - spin_lock(&tree->lock); -again: - while (1) { - /* - * this search will find all the extents that end after - * our range starts - */ - node = tree_search(tree, start); -process_node: - if (!node) - break; - - state = rb_entry(node, struct extent_state, rb_node); - - if (state->start > end) - goto out; - - if (state->state & bits) { - start = state->start; - refcount_inc(&state->refs); - wait_on_state(tree, state); - free_extent_state(state); - goto again; - } - start = state->end + 1; - - if (start > end) - break; - - if (!cond_resched_lock(&tree->lock)) { - node = rb_next(node); - goto process_node; - } - } -out: - spin_unlock(&tree->lock); -} - -static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, - u32 *bits, struct extent_changeset *changeset) -{ - u32 bits_to_set = *bits & ~EXTENT_CTLBITS; - int ret; - - if (tree->private_data && is_data_inode(tree->private_data)) - btrfs_set_delalloc_extent(tree->private_data, state, bits); - - if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { - u64 range = state->end - state->start + 1; - tree->dirty_bytes += range; - } - ret = add_extent_changeset(state, bits_to_set, changeset, 1); - BUG_ON(ret < 0); - state->state |= bits_to_set; -} - -static void cache_state_if_flags(struct extent_state *state, - struct extent_state **cached_ptr, - unsigned flags) -{ - if (cached_ptr && !(*cached_ptr)) { - if (!flags || (state->state & flags)) { - *cached_ptr = state; - refcount_inc(&state->refs); - } - } -} - -static void cache_state(struct extent_state *state, - struct extent_state **cached_ptr) -{ - return cache_state_if_flags(state, cached_ptr, - EXTENT_LOCKED | EXTENT_BOUNDARY); -} - -/* - * set some bits on a range in the tree. This may require allocations or - * sleeping, so the gfp mask is used to indicate what is allowed. - * - * If any of the exclusive bits are set, this will fail with -EEXIST if some - * part of the range already has the desired bits set. The start of the - * existing range is returned in failed_start in this case. - * - * [start, end] is inclusive This takes the tree lock. - */ -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - u32 exclusive_bits, u64 *failed_start, - struct extent_state **cached_state, gfp_t mask, - struct extent_changeset *changeset) -{ - struct extent_state *state; - struct extent_state *prealloc = NULL; - struct rb_node *node; - struct rb_node **p; - struct rb_node *parent; - int err = 0; - u64 last_start; - u64 last_end; - - btrfs_debug_check_extent_io_range(tree, start, end); - trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits); - - if (exclusive_bits) - ASSERT(failed_start); - else - ASSERT(failed_start == NULL); -again: - if (!prealloc && gfpflags_allow_blocking(mask)) { - /* - * Don't care for allocation failure here because we might end - * up not needing the pre-allocated extent state at all, which - * is the case if we only have in the tree extent states that - * cover our input range and don't cover too any other range. - * If we end up needing a new extent state we allocate it later. - */ - prealloc = alloc_extent_state(mask); - } - - spin_lock(&tree->lock); - if (cached_state && *cached_state) { - state = *cached_state; - if (state->start <= start && state->end > start && - extent_state_in_tree(state)) { - node = &state->rb_node; - goto hit_next; - } - } - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search_for_insert(tree, start, &p, &parent); - if (!node) { - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - err = insert_state(tree, prealloc, start, end, - &p, &parent, &bits, changeset); - if (err) - extent_io_tree_panic(tree, err); - - cache_state(prealloc, cached_state); - prealloc = NULL; - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); -hit_next: - last_start = state->start; - last_end = state->end; - - /* - * | ---- desired range ---- | - * | state | - * - * Just lock what we found and keep going - */ - if (state->start == start && state->end <= end) { - if (state->state & exclusive_bits) { - *failed_start = state->start; - err = -EEXIST; - goto out; - } - - set_state_bits(tree, state, &bits, changeset); - cache_state(state, cached_state); - merge_state(tree, state); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - state = next_state(state); - if (start < end && state && state->start == start && - !need_resched()) - goto hit_next; - goto search_again; - } - - /* - * | ---- desired range ---- | - * | state | - * or - * | ------------- state -------------- | - * - * We need to split the extent we found, and may flip bits on - * second half. - * - * If the extent we found extends past our - * range, we just split and search again. It'll get split - * again the next time though. - * - * If the extent we found is inside our range, we set the - * desired bit on it. - */ - if (state->start < start) { - if (state->state & exclusive_bits) { - *failed_start = start; - err = -EEXIST; - goto out; - } - - /* - * If this extent already has all the bits we want set, then - * skip it, not necessary to split it or do anything with it. - */ - if ((state->state & bits) == bits) { - start = state->end + 1; - cache_state(state, cached_state); - goto search_again; - } - - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, err); - - prealloc = NULL; - if (err) - goto out; - if (state->end <= end) { - set_state_bits(tree, state, &bits, changeset); - cache_state(state, cached_state); - merge_state(tree, state); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - state = next_state(state); - if (start < end && state && state->start == start && - !need_resched()) - goto hit_next; - } - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | or | state | - * - * There's a hole, we need to insert something in it and - * ignore the extent we found. - */ - if (state->start > start) { - u64 this_end; - if (end < last_start) - this_end = end; - else - this_end = last_start - 1; - - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - - /* - * Avoid to free 'prealloc' if it can be merged with - * the later extent. - */ - err = insert_state(tree, prealloc, start, this_end, - NULL, NULL, &bits, changeset); - if (err) - extent_io_tree_panic(tree, err); - - cache_state(prealloc, cached_state); - prealloc = NULL; - start = this_end + 1; - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | - * We need to split the extent, and set the bit - * on the first half - */ - if (state->start <= end && state->end > end) { - if (state->state & exclusive_bits) { - *failed_start = start; - err = -EEXIST; - goto out; - } - - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, err); - - set_state_bits(tree, prealloc, &bits, changeset); - cache_state(prealloc, cached_state); - merge_state(tree, prealloc); - prealloc = NULL; - goto out; - } - -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - if (gfpflags_allow_blocking(mask)) - cond_resched(); - goto again; - -out: - spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); - - return err; - -} - -/** - * convert_extent_bit - convert all bits in a given range from one bit to - * another - * @tree: the io tree to search - * @start: the start offset in bytes - * @end: the end offset in bytes (inclusive) - * @bits: the bits to set in this range - * @clear_bits: the bits to clear in this range - * @cached_state: state that we're going to cache - * - * This will go through and set bits for the given range. If any states exist - * already in this range they are set with the given bit and cleared of the - * clear_bits. This is only meant to be used by things that are mergeable, ie - * converting from say DELALLOC to DIRTY. This is not meant to be used with - * boundary bits like LOCK. - * - * All allocations are done with GFP_NOFS. - */ -int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, u32 clear_bits, - struct extent_state **cached_state) -{ - struct extent_state *state; - struct extent_state *prealloc = NULL; - struct rb_node *node; - struct rb_node **p; - struct rb_node *parent; - int err = 0; - u64 last_start; - u64 last_end; - bool first_iteration = true; - - btrfs_debug_check_extent_io_range(tree, start, end); - trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits, - clear_bits); - -again: - if (!prealloc) { - /* - * Best effort, don't worry if extent state allocation fails - * here for the first iteration. We might have a cached state - * that matches exactly the target range, in which case no - * extent state allocations are needed. We'll only know this - * after locking the tree. - */ - prealloc = alloc_extent_state(GFP_NOFS); - if (!prealloc && !first_iteration) - return -ENOMEM; - } - - spin_lock(&tree->lock); - if (cached_state && *cached_state) { - state = *cached_state; - if (state->start <= start && state->end > start && - extent_state_in_tree(state)) { - node = &state->rb_node; - goto hit_next; - } - } - - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search_for_insert(tree, start, &p, &parent); - if (!node) { - prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) { - err = -ENOMEM; - goto out; - } - err = insert_state(tree, prealloc, start, end, - &p, &parent, &bits, NULL); - if (err) - extent_io_tree_panic(tree, err); - cache_state(prealloc, cached_state); - prealloc = NULL; - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); -hit_next: - last_start = state->start; - last_end = state->end; - - /* - * | ---- desired range ---- | - * | state | - * - * Just lock what we found and keep going - */ - if (state->start == start && state->end <= end) { - set_state_bits(tree, state, &bits, NULL); - cache_state(state, cached_state); - state = clear_state_bit(tree, state, &clear_bits, 0, NULL); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - if (start < end && state && state->start == start && - !need_resched()) - goto hit_next; - goto search_again; - } - - /* - * | ---- desired range ---- | - * | state | - * or - * | ------------- state -------------- | - * - * We need to split the extent we found, and may flip bits on - * second half. - * - * If the extent we found extends past our - * range, we just split and search again. It'll get split - * again the next time though. - * - * If the extent we found is inside our range, we set the - * desired bit on it. - */ - if (state->start < start) { - prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) { - err = -ENOMEM; - goto out; - } - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, err); - prealloc = NULL; - if (err) - goto out; - if (state->end <= end) { - set_state_bits(tree, state, &bits, NULL); - cache_state(state, cached_state); - state = clear_state_bit(tree, state, &clear_bits, 0, - NULL); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - if (start < end && state && state->start == start && - !need_resched()) - goto hit_next; - } - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | or | state | - * - * There's a hole, we need to insert something in it and - * ignore the extent we found. - */ - if (state->start > start) { - u64 this_end; - if (end < last_start) - this_end = end; - else - this_end = last_start - 1; - - prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) { - err = -ENOMEM; - goto out; - } - - /* - * Avoid to free 'prealloc' if it can be merged with - * the later extent. - */ - err = insert_state(tree, prealloc, start, this_end, - NULL, NULL, &bits, NULL); - if (err) - extent_io_tree_panic(tree, err); - cache_state(prealloc, cached_state); - prealloc = NULL; - start = this_end + 1; - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | - * We need to split the extent, and set the bit - * on the first half - */ - if (state->start <= end && state->end > end) { - prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) { - err = -ENOMEM; - goto out; - } - - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, err); - - set_state_bits(tree, prealloc, &bits, NULL); - cache_state(prealloc, cached_state); - clear_state_bit(tree, prealloc, &clear_bits, 0, NULL); - prealloc = NULL; - goto out; - } - -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - cond_resched(); - first_iteration = false; - goto again; - -out: - spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); - - return err; -} - -/* wrappers around set/clear extent bit */ -int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_changeset *changeset) -{ - /* - * We don't support EXTENT_LOCKED yet, as current changeset will - * record any bits changed, so for EXTENT_LOCKED case, it will - * either fail with -EEXIST or changeset will record the whole - * range. - */ - BUG_ON(bits & EXTENT_LOCKED); - - return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, - changeset); -} - -int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits) -{ - return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, - GFP_NOWAIT, NULL); -} - -int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int wake, int delete, - struct extent_state **cached) -{ - return __clear_extent_bit(tree, start, end, bits, wake, delete, - cached, GFP_NOFS, NULL); -} - -int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_changeset *changeset) -{ - /* - * Don't support EXTENT_LOCKED case, same reason as - * set_record_extent_bits(). - */ - BUG_ON(bits & EXTENT_LOCKED); - - return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS, - changeset); -} - -/* - * either insert or lock state struct between start and end use mask to tell - * us if waiting is desired. - */ -int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state) -{ - int err; - u64 failed_start; - - while (1) { - err = set_extent_bit(tree, start, end, EXTENT_LOCKED, - EXTENT_LOCKED, &failed_start, - cached_state, GFP_NOFS, NULL); - if (err == -EEXIST) { - wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); - start = failed_start; - } else - break; - WARN_ON(start > end); - } - return err; -} - -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) -{ - int err; - u64 failed_start; - - err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, - &failed_start, NULL, GFP_NOFS, NULL); - if (err == -EEXIST) { - if (failed_start > start) - clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, 1, 0, NULL); - return 0; - } - return 1; } void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) @@ -1507,307 +199,18 @@ void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) { + struct address_space *mapping = inode->i_mapping; unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; - struct page *page; + struct folio *folio; while (index <= end_index) { - page = find_get_page(inode->i_mapping, index); - BUG_ON(!page); /* Pages should be in the extent_io_tree */ - __set_page_dirty_nobuffers(page); - account_page_redirty(page); - put_page(page); - index++; - } -} - -/* find the first state struct with 'bits' set after 'start', and - * return it. tree->lock must be held. NULL will returned if - * nothing was found after 'start' - */ -static struct extent_state * -find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits) -{ - struct rb_node *node; - struct extent_state *state; - - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, start); - if (!node) - goto out; - - while (1) { - state = rb_entry(node, struct extent_state, rb_node); - if (state->end >= start && (state->state & bits)) - return state; - - node = rb_next(node); - if (!node) - break; - } -out: - return NULL; -} - -/* - * Find the first offset in the io tree with one or more @bits set. - * - * Note: If there are multiple bits set in @bits, any of them will match. - * - * Return 0 if we find something, and update @start_ret and @end_ret. - * Return 1 if we found nothing. - */ -int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits, - struct extent_state **cached_state) -{ - struct extent_state *state; - int ret = 1; - - spin_lock(&tree->lock); - if (cached_state && *cached_state) { - state = *cached_state; - if (state->end == start - 1 && extent_state_in_tree(state)) { - while ((state = next_state(state)) != NULL) { - if (state->state & bits) - goto got_it; - } - free_extent_state(*cached_state); - *cached_state = NULL; - goto out; - } - free_extent_state(*cached_state); - *cached_state = NULL; - } - - state = find_first_extent_bit_state(tree, start, bits); -got_it: - if (state) { - cache_state_if_flags(state, cached_state, 0); - *start_ret = state->start; - *end_ret = state->end; - ret = 0; - } -out: - spin_unlock(&tree->lock); - return ret; -} - -/** - * Find a contiguous area of bits - * - * @tree: io tree to check - * @start: offset to start the search from - * @start_ret: the first offset we found with the bits set - * @end_ret: the final contiguous range of the bits that were set - * @bits: bits to look for - * - * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges - * to set bits appropriately, and then merge them again. During this time it - * will drop the tree->lock, so use this helper if you want to find the actual - * contiguous area for given bits. We will search to the first bit we find, and - * then walk down the tree until we find a non-contiguous area. The area - * returned will be the full contiguous area with the bits set. - */ -int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits) -{ - struct extent_state *state; - int ret = 1; - - spin_lock(&tree->lock); - state = find_first_extent_bit_state(tree, start, bits); - if (state) { - *start_ret = state->start; - *end_ret = state->end; - while ((state = next_state(state)) != NULL) { - if (state->start > (*end_ret + 1)) - break; - *end_ret = state->end; - } - ret = 0; - } - spin_unlock(&tree->lock); - return ret; -} - -/** - * Find the first range that has @bits not set. This range could start before - * @start. - * - * @tree: the tree to search - * @start: offset at/after which the found extent should start - * @start_ret: records the beginning of the range - * @end_ret: records the end of the range (inclusive) - * @bits: the set of bits which must be unset - * - * Since unallocated range is also considered one which doesn't have the bits - * set it's possible that @end_ret contains -1, this happens in case the range - * spans (last_range_end, end of device]. In this case it's up to the caller to - * trim @end_ret to the appropriate size. - */ -void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits) -{ - struct extent_state *state; - struct rb_node *node, *prev = NULL, *next; - - spin_lock(&tree->lock); - - /* Find first extent with bits cleared */ - while (1) { - node = __etree_search(tree, start, &next, &prev, NULL, NULL); - if (!node && !next && !prev) { - /* - * Tree is completely empty, send full range and let - * caller deal with it - */ - *start_ret = 0; - *end_ret = -1; - goto out; - } else if (!node && !next) { - /* - * We are past the last allocated chunk, set start at - * the end of the last extent. - */ - state = rb_entry(prev, struct extent_state, rb_node); - *start_ret = state->end + 1; - *end_ret = -1; - goto out; - } else if (!node) { - node = next; - } - /* - * At this point 'node' either contains 'start' or start is - * before 'node' - */ - state = rb_entry(node, struct extent_state, rb_node); - - if (in_range(start, state->start, state->end - state->start + 1)) { - if (state->state & bits) { - /* - * |--range with bits sets--| - * | - * start - */ - start = state->end + 1; - } else { - /* - * 'start' falls within a range that doesn't - * have the bits set, so take its start as - * the beginning of the desired range - * - * |--range with bits cleared----| - * | - * start - */ - *start_ret = state->start; - break; - } - } else { - /* - * |---prev range---|---hole/unset---|---node range---| - * | - * start - * - * or - * - * |---hole/unset--||--first node--| - * 0 | - * start - */ - if (prev) { - state = rb_entry(prev, struct extent_state, - rb_node); - *start_ret = state->end + 1; - } else { - *start_ret = 0; - } - break; - } - } - - /* - * Find the longest stretch from start until an entry which has the - * bits set - */ - while (1) { - state = rb_entry(node, struct extent_state, rb_node); - if (state->end >= start && !(state->state & bits)) { - *end_ret = state->end; - } else { - *end_ret = state->start - 1; - break; - } - - node = rb_next(node); - if (!node) - break; - } -out: - spin_unlock(&tree->lock); -} - -/* - * find a contiguous range of bytes in the file marked as delalloc, not - * more than 'max_bytes'. start and end are used to return the range, - * - * true is returned if we find something, false if nothing was in the tree - */ -bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, - u64 *end, u64 max_bytes, - struct extent_state **cached_state) -{ - struct rb_node *node; - struct extent_state *state; - u64 cur_start = *start; - bool found = false; - u64 total_bytes = 0; - - spin_lock(&tree->lock); - - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, cur_start); - if (!node) { - *end = (u64)-1; - goto out; - } - - while (1) { - state = rb_entry(node, struct extent_state, rb_node); - if (found && (state->start != cur_start || - (state->state & EXTENT_BOUNDARY))) { - goto out; - } - if (!(state->state & EXTENT_DELALLOC)) { - if (!found) - *end = state->end; - goto out; - } - if (!found) { - *start = state->start; - *cached_state = state; - refcount_inc(&state->refs); - } - found = true; - *end = state->end; - cur_start = state->end + 1; - node = rb_next(node); - total_bytes += state->end - state->start + 1; - if (total_bytes >= max_bytes) - break; - if (!node) - break; + folio = filemap_get_folio(mapping, index); + filemap_dirty_folio(mapping, folio); + folio_account_redirty(folio); + index += folio_nr_pages(folio); + folio_put(folio); } -out: - spin_unlock(&tree->lock); - return found; } /* @@ -1867,9 +270,8 @@ static int __process_pages_contig(struct address_space *mapping, pgoff_t start_index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; pgoff_t index = start_index; - unsigned long nr_pages = end_index - start_index + 1; unsigned long pages_processed = 0; - struct page *pages[16]; + struct folio_batch fbatch; int err = 0; int i; @@ -1878,16 +280,17 @@ static int __process_pages_contig(struct address_space *mapping, ASSERT(processed_end && *processed_end == start); } - if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) + if ((page_ops & PAGE_SET_ERROR) && start_index <= end_index) mapping_set_error(mapping, -EIO); - while (nr_pages > 0) { - int found_pages; + folio_batch_init(&fbatch); + while (index <= end_index) { + int found_folios; + + found_folios = filemap_get_folios_contig(mapping, &index, + end_index, &fbatch); - found_pages = find_get_pages_contig(mapping, index, - min_t(unsigned long, - nr_pages, ARRAY_SIZE(pages)), pages); - if (found_pages == 0) { + if (found_folios == 0) { /* * Only if we're going to lock these pages, we can find * nothing at @index. @@ -1897,23 +300,20 @@ static int __process_pages_contig(struct address_space *mapping, goto out; } - for (i = 0; i < found_pages; i++) { + for (i = 0; i < found_folios; i++) { int process_ret; - + struct folio *folio = fbatch.folios[i]; process_ret = process_one_page(fs_info, mapping, - pages[i], locked_page, page_ops, + &folio->page, locked_page, page_ops, start, end); if (process_ret < 0) { - for (; i < found_pages; i++) - put_page(pages[i]); err = -EAGAIN; + folio_batch_release(&fbatch); goto out; } - put_page(pages[i]); - pages_processed++; + pages_processed += folio_nr_pages(folio); } - nr_pages -= found_pages; - index += found_pages; + folio_batch_release(&fbatch); cond_resched(); } out: @@ -1992,10 +392,12 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, struct page *locked_page, u64 *start, u64 *end) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; const u64 orig_start = *start; const u64 orig_end = *end; - u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; + /* The sanity tests may not set a valid fs_info. */ + u64 max_bytes = fs_info ? fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE; u64 delalloc_start; u64 delalloc_end; bool found; @@ -2059,14 +461,14 @@ again: } /* step three, lock the state bits for the whole range */ - lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state); + lock_extent(tree, delalloc_start, delalloc_end, &cached_state); /* then test to make sure it is all still delalloc */ ret = test_range_bit(tree, delalloc_start, delalloc_end, EXTENT_DELALLOC, 1, cached_state); if (!ret) { - unlock_extent_cached(tree, delalloc_start, delalloc_end, - &cached_state); + unlock_extent(tree, delalloc_start, delalloc_end, + &cached_state); __unlock_for_delalloc(inode, locked_page, delalloc_start, delalloc_end); cond_resched(); @@ -2083,210 +485,46 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, u32 clear_bits, unsigned long page_ops) { - clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL); + clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL); __process_pages_contig(inode->vfs_inode.i_mapping, locked_page, start, end, page_ops, NULL); } -/* - * count the number of bytes in the tree that have a given bit(s) - * set. This can be fairly slow, except for EXTENT_DIRTY which is - * cached. The total number found is returned. - */ -u64 count_range_bits(struct extent_io_tree *tree, - u64 *start, u64 search_end, u64 max_bytes, - u32 bits, int contig) +static int insert_failrec(struct btrfs_inode *inode, + struct io_failure_record *failrec) { - struct rb_node *node; - struct extent_state *state; - u64 cur_start = *start; - u64 total_bytes = 0; - u64 last = 0; - int found = 0; - - if (WARN_ON(search_end <= cur_start)) - return 0; - - spin_lock(&tree->lock); - if (cur_start == 0 && bits == EXTENT_DIRTY) { - total_bytes = tree->dirty_bytes; - goto out; - } - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, cur_start); - if (!node) - goto out; - - while (1) { - state = rb_entry(node, struct extent_state, rb_node); - if (state->start > search_end) - break; - if (contig && found && state->start > last + 1) - break; - if (state->end >= cur_start && (state->state & bits) == bits) { - total_bytes += min(search_end, state->end) + 1 - - max(cur_start, state->start); - if (total_bytes >= max_bytes) - break; - if (!found) { - *start = max(cur_start, state->start); - found = 1; - } - last = state->end; - } else if (contig && found) { - break; - } - node = rb_next(node); - if (!node) - break; - } -out: - spin_unlock(&tree->lock); - return total_bytes; -} + struct rb_node *exist; -/* - * set the private field for a given byte offset in the tree. If there isn't - * an extent_state there already, this does nothing. - */ -int set_state_failrec(struct extent_io_tree *tree, u64 start, - struct io_failure_record *failrec) -{ - struct rb_node *node; - struct extent_state *state; - int ret = 0; + spin_lock(&inode->io_failure_lock); + exist = rb_simple_insert(&inode->io_failure_tree, failrec->bytenr, + &failrec->rb_node); + spin_unlock(&inode->io_failure_lock); - spin_lock(&tree->lock); - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, start); - if (!node) { - ret = -ENOENT; - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); - if (state->start != start) { - ret = -ENOENT; - goto out; - } - state->failrec = failrec; -out: - spin_unlock(&tree->lock); - return ret; + return (exist == NULL) ? 0 : -EEXIST; } -struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start) +static struct io_failure_record *get_failrec(struct btrfs_inode *inode, u64 start) { struct rb_node *node; - struct extent_state *state; - struct io_failure_record *failrec; + struct io_failure_record *failrec = ERR_PTR(-ENOENT); - spin_lock(&tree->lock); - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, start); - if (!node) { - failrec = ERR_PTR(-ENOENT); - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); - if (state->start != start) { - failrec = ERR_PTR(-ENOENT); - goto out; - } - - failrec = state->failrec; -out: - spin_unlock(&tree->lock); + spin_lock(&inode->io_failure_lock); + node = rb_simple_search(&inode->io_failure_tree, start); + if (node) + failrec = rb_entry(node, struct io_failure_record, rb_node); + spin_unlock(&inode->io_failure_lock); return failrec; } -/* - * searches a range in the state tree for a given mask. - * If 'filled' == 1, this returns 1 only if every extent in the tree - * has the bits set. Otherwise, 1 is returned if any bit in the - * range is found set. - */ -int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int filled, struct extent_state *cached) +static void free_io_failure(struct btrfs_inode *inode, + struct io_failure_record *rec) { - struct extent_state *state = NULL; - struct rb_node *node; - int bitset = 0; - - spin_lock(&tree->lock); - if (cached && extent_state_in_tree(cached) && cached->start <= start && - cached->end > start) - node = &cached->rb_node; - else - node = tree_search(tree, start); - while (node && start <= end) { - state = rb_entry(node, struct extent_state, rb_node); - - if (filled && state->start > start) { - bitset = 0; - break; - } - - if (state->start > end) - break; - - if (state->state & bits) { - bitset = 1; - if (!filled) - break; - } else if (filled) { - bitset = 0; - break; - } - - if (state->end == (u64)-1) - break; - - start = state->end + 1; - if (start > end) - break; - node = rb_next(node); - if (!node) { - if (filled) - bitset = 0; - break; - } - } - spin_unlock(&tree->lock); - return bitset; -} - -int free_io_failure(struct extent_io_tree *failure_tree, - struct extent_io_tree *io_tree, - struct io_failure_record *rec) -{ - int ret; - int err = 0; - - set_state_failrec(failure_tree, rec->start, NULL); - ret = clear_extent_bits(failure_tree, rec->start, - rec->start + rec->len - 1, - EXTENT_LOCKED | EXTENT_DIRTY); - if (ret) - err = ret; - - ret = clear_extent_bits(io_tree, rec->start, - rec->start + rec->len - 1, - EXTENT_DAMAGED); - if (ret && !err) - err = ret; + spin_lock(&inode->io_failure_lock); + rb_erase(&rec->rb_node, &inode->io_failure_tree); + spin_unlock(&inode->io_failure_lock); kfree(rec); - return err; } /* @@ -2303,12 +541,13 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 length, u64 logical, struct page *page, unsigned int pg_offset, int mirror_num) { - struct bio *bio; struct btrfs_device *dev; + struct bio_vec bvec; + struct bio bio; u64 map_length = 0; u64 sector; struct btrfs_io_context *bioc = NULL; - int ret; + int ret = 0; ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); BUG_ON(!mirror_num); @@ -2316,8 +555,6 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, if (btrfs_repair_one_zone(fs_info, logical)) return 0; - bio = btrfs_bio_alloc(1); - bio->bi_iter.bi_size = 0; map_length = length; /* @@ -2335,52 +572,50 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, */ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, &map_length, &bioc, 0); - if (ret) { - btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return -EIO; - } + if (ret) + goto out_counter_dec; ASSERT(bioc->mirror_num == 1); } else { ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, &bioc, mirror_num); - if (ret) { - btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return -EIO; - } + if (ret) + goto out_counter_dec; BUG_ON(mirror_num != bioc->mirror_num); } sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; - bio->bi_iter.bi_sector = sector; dev = bioc->stripes[bioc->mirror_num - 1].dev; btrfs_put_bioc(bioc); + if (!dev || !dev->bdev || !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { - btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return -EIO; + ret = -EIO; + goto out_counter_dec; } - bio_set_dev(bio, dev->bdev); - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; - bio_add_page(bio, page, length, pg_offset); - if (btrfsic_submit_bio_wait(bio)) { + bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); + bio.bi_iter.bi_sector = sector; + __bio_add_page(&bio, page, length, pg_offset); + + btrfsic_check_bio(&bio); + ret = submit_bio_wait(&bio); + if (ret) { /* try to remap that extent elsewhere? */ - btrfs_bio_counter_dec(fs_info); - bio_put(bio); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); - return -EIO; + goto out_bio_uninit; } btrfs_info_rl_in_rcu(fs_info, "read error corrected: ino %llu off %llu (dev %s sector %llu)", ino, start, rcu_str_deref(dev->name), sector); + ret = 0; + +out_bio_uninit: + bio_uninit(&bio); +out_counter_dec: btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return 0; + return ret; } int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) @@ -2406,28 +641,36 @@ int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) return ret; } +static int next_mirror(const struct io_failure_record *failrec, int cur_mirror) +{ + if (cur_mirror == failrec->num_copies) + return cur_mirror + 1 - failrec->num_copies; + return cur_mirror + 1; +} + +static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror) +{ + if (cur_mirror == 1) + return failrec->num_copies; + return cur_mirror - 1; +} + /* * each time an IO finishes, we do a fast check in the IO failure tree * to see if we need to process or clean up an io_failure_record */ -int clean_io_failure(struct btrfs_fs_info *fs_info, - struct extent_io_tree *failure_tree, - struct extent_io_tree *io_tree, u64 start, - struct page *page, u64 ino, unsigned int pg_offset) +int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, + struct page *page, unsigned int pg_offset) { - u64 private; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + u64 ino = btrfs_ino(inode); + u64 locked_start, locked_end; struct io_failure_record *failrec; - struct extent_state *state; - int num_copies; + int mirror; int ret; - private = 0; - ret = count_range_bits(failure_tree, &private, (u64)-1, 1, - EXTENT_DIRTY, 0); - if (!ret) - return 0; - - failrec = get_state_failrec(failure_tree, start); + failrec = get_failrec(inode, start); if (IS_ERR(failrec)) return 0; @@ -2436,26 +679,21 @@ int clean_io_failure(struct btrfs_fs_info *fs_info, if (sb_rdonly(fs_info->sb)) goto out; - spin_lock(&io_tree->lock); - state = find_first_extent_bit_state(io_tree, - failrec->start, - EXTENT_LOCKED); - spin_unlock(&io_tree->lock); - - if (state && state->start <= failrec->start && - state->end >= failrec->start + failrec->len - 1) { - num_copies = btrfs_num_copies(fs_info, failrec->logical, - failrec->len); - if (num_copies > 1) { - repair_io_failure(fs_info, ino, start, failrec->len, - failrec->logical, page, pg_offset, - failrec->failed_mirror); - } - } + ret = find_first_extent_bit(io_tree, failrec->bytenr, &locked_start, + &locked_end, EXTENT_LOCKED, NULL); + if (ret || locked_start > failrec->bytenr || + locked_end < failrec->bytenr + failrec->len - 1) + goto out; -out: - free_io_failure(failure_tree, io_tree, failrec); + mirror = failrec->this_mirror; + do { + mirror = prev_mirror(failrec, mirror); + repair_io_failure(fs_info, ino, start, failrec->len, + failrec->logical, page, pg_offset, mirror); + } while (mirror != failrec->failed_mirror); +out: + free_io_failure(inode, failrec); return 0; } @@ -2467,56 +705,50 @@ out: */ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) { - struct extent_io_tree *failure_tree = &inode->io_failure_tree; struct io_failure_record *failrec; - struct extent_state *state, *next; + struct rb_node *node, *next; - if (RB_EMPTY_ROOT(&failure_tree->state)) + if (RB_EMPTY_ROOT(&inode->io_failure_tree)) return; - spin_lock(&failure_tree->lock); - state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY); - while (state) { - if (state->start > end) + spin_lock(&inode->io_failure_lock); + node = rb_simple_search_first(&inode->io_failure_tree, start); + while (node) { + failrec = rb_entry(node, struct io_failure_record, rb_node); + if (failrec->bytenr > end) break; - ASSERT(state->end <= end); - - next = next_state(state); - - failrec = state->failrec; - free_extent_state(state); + next = rb_next(node); + rb_erase(&failrec->rb_node, &inode->io_failure_tree); kfree(failrec); - state = next; + node = next; } - spin_unlock(&failure_tree->lock); + spin_unlock(&inode->io_failure_lock); } static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, - u64 start) + struct btrfs_bio *bbio, + unsigned int bio_offset) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + u64 start = bbio->file_offset + bio_offset; struct io_failure_record *failrec; - struct extent_map *em; - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; const u32 sectorsize = fs_info->sectorsize; int ret; - u64 logical; - failrec = get_state_failrec(failure_tree, start); + failrec = get_failrec(BTRFS_I(inode), start); if (!IS_ERR(failrec)) { btrfs_debug(fs_info, "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu", - failrec->logical, failrec->start, failrec->len); + failrec->logical, failrec->bytenr, failrec->len); /* * when data can be on disk more than twice, add to failrec here * (e.g. with a list for failed_mirror) to make * clean_io_failure() clean all those errors at once. */ - + ASSERT(failrec->this_mirror == bbio->mirror_num); + ASSERT(failrec->len == fs_info->sectorsize); return failrec; } @@ -2524,53 +756,34 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode if (!failrec) return ERR_PTR(-ENOMEM); - failrec->start = start; + RB_CLEAR_NODE(&failrec->rb_node); + failrec->bytenr = start; failrec->len = sectorsize; - failrec->this_mirror = 0; - failrec->bio_flags = 0; + failrec->failed_mirror = bbio->mirror_num; + failrec->this_mirror = bbio->mirror_num; + failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, failrec->len); - if (!em) { - read_unlock(&em_tree->lock); - kfree(failrec); - return ERR_PTR(-EIO); - } + btrfs_debug(fs_info, + "new io failure record logical %llu start %llu", + failrec->logical, start); - if (em->start > start || em->start + em->len <= start) { - free_extent_map(em); - em = NULL; - } - read_unlock(&em_tree->lock); - if (!em) { + failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize); + if (failrec->num_copies == 1) { + /* + * We only have a single copy of the data, so don't bother with + * all the retry and error correction code that follows. No + * matter what the error is, it is very likely to persist. + */ + btrfs_debug(fs_info, + "cannot repair logical %llu num_copies %d", + failrec->logical, failrec->num_copies); kfree(failrec); return ERR_PTR(-EIO); } - logical = start - em->start; - logical = em->block_start + logical; - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - logical = em->block_start; - failrec->bio_flags = EXTENT_BIO_COMPRESSED; - extent_set_compress_type(&failrec->bio_flags, em->compress_type); - } - - btrfs_debug(fs_info, - "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu", - logical, start, failrec->len); - - failrec->logical = logical; - free_extent_map(em); - /* Set the bits in the private failure tree */ - ret = set_extent_bits(failure_tree, start, start + sectorsize - 1, - EXTENT_LOCKED | EXTENT_DIRTY); - if (ret >= 0) { - ret = set_state_failrec(failure_tree, start, failrec); - /* Set the bits in the inode's tree */ - ret = set_extent_bits(tree, start, start + sectorsize - 1, - EXTENT_DAMAGED); - } else if (ret < 0) { + ret = insert_failrec(BTRFS_I(inode), failrec); + if (ret) { kfree(failrec); return ERR_PTR(ret); } @@ -2578,90 +791,50 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode return failrec; } -static bool btrfs_check_repairable(struct inode *inode, - struct io_failure_record *failrec, - int failed_mirror) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int num_copies; - - num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); - if (num_copies == 1) { - /* - * we only have a single copy of the data, so don't bother with - * all the retry and error correction code that follows. no - * matter what the error is, it is very likely to persist. - */ - btrfs_debug(fs_info, - "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d", - num_copies, failrec->this_mirror, failed_mirror); - return false; - } - - /* The failure record should only contain one sector */ - ASSERT(failrec->len == fs_info->sectorsize); - - /* - * There are two premises: - * a) deliver good data to the caller - * b) correct the bad sectors on disk - * - * Since we're only doing repair for one sector, we only need to get - * a good copy of the failed sector and if we succeed, we have setup - * everything for repair_io_failure to do the rest for us. - */ - failrec->failed_mirror = failed_mirror; - failrec->this_mirror++; - if (failrec->this_mirror == failed_mirror) - failrec->this_mirror++; - - if (failrec->this_mirror > num_copies) { - btrfs_debug(fs_info, - "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d", - num_copies, failrec->this_mirror, failed_mirror); - return false; - } - - return true; -} - -int btrfs_repair_one_sector(struct inode *inode, - struct bio *failed_bio, u32 bio_offset, - struct page *page, unsigned int pgoff, - u64 start, int failed_mirror, +int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, + u32 bio_offset, struct page *page, unsigned int pgoff, submit_bio_hook_t *submit_bio_hook) { + u64 start = failed_bbio->file_offset + bio_offset; struct io_failure_record *failrec; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct btrfs_bio *failed_bbio = btrfs_bio(failed_bio); + struct bio *failed_bio = &failed_bbio->bio; const int icsum = bio_offset >> fs_info->sectorsize_bits; struct bio *repair_bio; struct btrfs_bio *repair_bbio; - blk_status_t status; btrfs_debug(fs_info, "repair read error: read error at %llu", start); BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); - failrec = btrfs_get_io_failure_record(inode, start); + failrec = btrfs_get_io_failure_record(inode, failed_bbio, bio_offset); if (IS_ERR(failrec)) return PTR_ERR(failrec); - - if (!btrfs_check_repairable(inode, failrec, failed_mirror)) { - free_io_failure(failure_tree, tree, failrec); + /* + * There are two premises: + * a) deliver good data to the caller + * b) correct the bad sectors on disk + * + * Since we're only doing repair for one sector, we only need to get + * a good copy of the failed sector and if we succeed, we have setup + * everything for repair_io_failure to do the rest for us. + */ + failrec->this_mirror = next_mirror(failrec, failrec->this_mirror); + if (failrec->this_mirror == failrec->failed_mirror) { + btrfs_debug(fs_info, + "failed to repair num_copies %d this_mirror %d failed_mirror %d", + failrec->num_copies, failrec->this_mirror, failrec->failed_mirror); + free_io_failure(BTRFS_I(inode), failrec); return -EIO; } - repair_bio = btrfs_bio_alloc(1); + repair_bio = btrfs_bio_alloc(1, REQ_OP_READ, failed_bbio->end_io, + failed_bbio->private); repair_bbio = btrfs_bio(repair_bio); - repair_bio->bi_opf = REQ_OP_READ; - repair_bio->bi_end_io = failed_bio->bi_end_io; + repair_bbio->file_offset = start; repair_bio->bi_iter.bi_sector = failrec->logical >> 9; - repair_bio->bi_private = failed_bio->bi_private; if (failed_bbio->csum) { const u32 csum_size = fs_info->csum_size; @@ -2678,13 +851,13 @@ int btrfs_repair_one_sector(struct inode *inode, "repair read error: submitting new read to mirror %d", failrec->this_mirror); - status = submit_bio_hook(inode, repair_bio, failrec->this_mirror, - failrec->bio_flags); - if (status) { - free_io_failure(failure_tree, tree, failrec); - bio_put(repair_bio); - } - return blk_status_to_errno(status); + /* + * At this point we have a bio, so any errors from submit_bio_hook() + * will be handled by the endio on the repair_bio, so we can't return an + * error here. + */ + submit_bio_hook(inode, repair_bio, failrec->this_mirror, 0); + return BLK_STS_OK; } static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) @@ -2709,26 +882,44 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) btrfs_page_set_error(fs_info, page, start, len); } - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) unlock_page(page); else btrfs_subpage_end_reader(fs_info, page, start, len); } -static blk_status_t submit_read_repair(struct inode *inode, - struct bio *failed_bio, u32 bio_offset, - struct page *page, unsigned int pgoff, - u64 start, u64 end, int failed_mirror, - unsigned int error_bitmap, - submit_bio_hook_t *submit_bio_hook) +static void end_sector_io(struct page *page, u64 offset, bool uptodate) { + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + const u32 sectorsize = inode->root->fs_info->sectorsize; + struct extent_state *cached = NULL; + + end_page_read(page, uptodate, offset, sectorsize); + if (uptodate) + set_extent_uptodate(&inode->io_tree, offset, + offset + sectorsize - 1, &cached, GFP_ATOMIC); + unlock_extent_atomic(&inode->io_tree, offset, offset + sectorsize - 1, + &cached); +} + +static void submit_data_read_repair(struct inode *inode, + struct btrfs_bio *failed_bbio, + u32 bio_offset, const struct bio_vec *bvec, + unsigned int error_bitmap) +{ + const unsigned int pgoff = bvec->bv_offset; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct page *page = bvec->bv_page; + const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset; + const u64 end = start + bvec->bv_len - 1; const u32 sectorsize = fs_info->sectorsize; const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits; - int error = 0; int i; - BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); + BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE); + + /* This repair is only for data */ + ASSERT(is_data_inode(inode)); /* We're here because we had some read errors or csum mismatch */ ASSERT(error_bitmap); @@ -2737,12 +928,11 @@ static blk_status_t submit_read_repair(struct inode *inode, * We only get called on buffered IO, thus page must be mapped and bio * must not be cloned. */ - ASSERT(page->mapping && !bio_flagged(failed_bio, BIO_CLONED)); + ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED)); /* Iterate through all the sectors in the range */ for (i = 0; i < nr_bits; i++) { const unsigned int offset = i * sectorsize; - struct extent_state *cached = NULL; bool uptodate = false; int ret; @@ -2755,10 +945,9 @@ static blk_status_t submit_read_repair(struct inode *inode, goto next; } - ret = btrfs_repair_one_sector(inode, failed_bio, - bio_offset + offset, - page, pgoff + offset, start + offset, - failed_mirror, submit_bio_hook); + ret = btrfs_repair_one_sector(inode, failed_bbio, + bio_offset + offset, page, pgoff + offset, + btrfs_submit_data_read_bio); if (!ret) { /* * We have submitted the read repair, the page release @@ -2769,24 +958,12 @@ static blk_status_t submit_read_repair(struct inode *inode, continue; } /* - * Repair failed, just record the error but still continue. - * Or the remaining sectors will not be properly unlocked. + * Continue on failed repair, otherwise the remaining sectors + * will not be properly unlocked. */ - if (!error) - error = ret; next: - end_page_read(page, uptodate, start + offset, sectorsize); - if (uptodate) - set_extent_uptodate(&BTRFS_I(inode)->io_tree, - start + offset, - start + offset + sectorsize - 1, - &cached, GFP_ATOMIC); - unlock_extent_cached_atomic(&BTRFS_I(inode)->io_tree, - start + offset, - start + offset + sectorsize - 1, - &cached); + end_sector_io(page, start + offset, uptodate); } - return errno_to_blk_status(error); } /* lots and lots of room for performance fixes in the end_bio funcs */ @@ -2824,8 +1001,9 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. */ -static void end_bio_extent_writepage(struct bio *bio) +static void end_bio_extent_writepage(struct btrfs_bio *bbio) { + struct bio *bio = &bbio->bio; int error = blk_status_to_errno(bio->bi_status); struct bio_vec *bvec; u64 start; @@ -2925,11 +1103,7 @@ static void endio_readpage_release_extent(struct processed_extent *processed, * Now we don't have range contiguous to the processed range, release * the processed range now. */ - if (processed->uptodate && tree->track_uptodate) - set_extent_uptodate(tree, processed->start, processed->end, - &cached, GFP_ATOMIC); - unlock_extent_cached_atomic(tree, processed->start, processed->end, - &cached); + unlock_extent_atomic(tree, processed->start, processed->end, &cached); update: /* Update processed to current range */ @@ -2942,7 +1116,7 @@ update: static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) { ASSERT(PageLocked(page)); - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) return; ASSERT(PagePrivate(page)); @@ -2964,7 +1138,7 @@ static struct extent_buffer *find_extent_buffer_readpage( * For regular sectorsize, we can use page->private to grab extent * buffer */ - if (fs_info->sectorsize == PAGE_SIZE) { + if (fs_info->nodesize >= PAGE_SIZE) { ASSERT(PagePrivate(page) && page->private); return (struct extent_buffer *)page->private; } @@ -2989,11 +1163,10 @@ static struct extent_buffer *find_extent_buffer_readpage( * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. */ -static void end_bio_extent_readpage(struct bio *bio) +static void end_bio_extent_readpage(struct btrfs_bio *bbio) { + struct bio *bio = &bbio->bio; struct bio_vec *bvec; - struct btrfs_bio *bbio = btrfs_bio(bio); - struct extent_io_tree *tree, *failure_tree; struct processed_extent processed = { 0 }; /* * The offset to the beginning of a bio, since one bio can never be @@ -3001,7 +1174,6 @@ static void end_bio_extent_readpage(struct bio *bio) */ u32 bio_offset = 0; int mirror; - int ret; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); @@ -3012,6 +1184,7 @@ static void end_bio_extent_readpage(struct bio *bio) struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u32 sectorsize = fs_info->sectorsize; unsigned int error_bitmap = (unsigned int)-1; + bool repair = false; u64 start; u64 end; u32 len; @@ -3020,8 +1193,6 @@ static void end_bio_extent_readpage(struct bio *bio) "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", bio->bi_iter.bi_sector, bio->bi_status, bbio->mirror_num); - tree = &BTRFS_I(inode)->io_tree; - failure_tree = &BTRFS_I(inode)->io_failure_tree; /* * We always issue full-sector reads, but if some block in a @@ -3049,49 +1220,21 @@ static void end_bio_extent_readpage(struct bio *bio) if (is_data_inode(inode)) { error_bitmap = btrfs_verify_data_csum(bbio, bio_offset, page, start, end); - ret = error_bitmap; + if (error_bitmap) + uptodate = false; } else { - ret = btrfs_validate_metadata_buffer(bbio, - page, start, end, mirror); + if (btrfs_validate_metadata_buffer(bbio, + page, start, end, mirror)) + uptodate = false; } - if (ret) - uptodate = false; - else - clean_io_failure(BTRFS_I(inode)->root->fs_info, - failure_tree, tree, start, - page, - btrfs_ino(BTRFS_I(inode)), 0); } - if (likely(uptodate)) - goto readpage_ok; - - if (is_data_inode(inode)) { - /* - * btrfs_submit_read_repair() will handle all the good - * and bad sectors, we just continue to the next bvec. - */ - submit_read_repair(inode, bio, bio_offset, page, - start - page_offset(page), start, - end, mirror, error_bitmap, - btrfs_submit_data_bio); - - ASSERT(bio_offset + len > bio_offset); - bio_offset += len; - continue; - } else { - struct extent_buffer *eb; - - eb = find_extent_buffer_readpage(fs_info, page, start); - set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); - eb->read_mirror = mirror; - atomic_dec(&eb->io_pages); - } -readpage_ok: if (likely(uptodate)) { loff_t i_size = i_size_read(inode); pgoff_t end_index = i_size >> PAGE_SHIFT; + btrfs_clean_io_failure(BTRFS_I(inode), start, page, 0); + /* * Zero out the remaining part if this range straddles * i_size. @@ -3108,14 +1251,44 @@ readpage_ok: zero_user_segment(page, zero_start, offset_in_page(end) + 1); } + } else if (is_data_inode(inode)) { + /* + * Only try to repair bios that actually made it to a + * device. If the bio failed to be submitted mirror + * is 0 and we need to fail it without retrying. + * + * This also includes the high level bios for compressed + * extents - these never make it to a device and repair + * is already handled on the lower compressed bio. + */ + if (mirror > 0) + repair = true; + } else { + struct extent_buffer *eb; + + eb = find_extent_buffer_readpage(fs_info, page, start); + set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); + eb->read_mirror = mirror; + atomic_dec(&eb->io_pages); + } + + if (repair) { + /* + * submit_data_read_repair() will handle all the good + * and bad sectors, we just continue to the next bvec. + */ + submit_data_read_repair(inode, bbio, bio_offset, bvec, + error_bitmap); + } else { + /* Update page status and unlock */ + end_page_read(page, uptodate, start, len); + endio_readpage_release_extent(&processed, BTRFS_I(inode), + start, end, PageUptodate(page)); } + ASSERT(bio_offset + len > bio_offset); bio_offset += len; - /* Update page status and unlock */ - end_page_read(page, uptodate, start, len); - endio_readpage_release_extent(&processed, BTRFS_I(inode), - start, end, PageUptodate(page)); } /* Release the last extent */ endio_readpage_release_extent(&processed, NULL, 0, 0, false); @@ -3123,61 +1296,40 @@ readpage_ok: bio_put(bio); } -/* - * Initialize the members up to but not including 'bio'. Use after allocating a - * new bio by bio_alloc_bioset as it does not initialize the bytes outside of - * 'bio' because use of __GFP_ZERO is not supported. - */ -static inline void btrfs_bio_init(struct btrfs_bio *bbio) -{ - memset(bbio, 0, offsetof(struct btrfs_bio, bio)); -} - -/* - * Allocate a btrfs_io_bio, with @nr_iovecs as maximum number of iovecs. +/** + * Populate every free slot in a provided array with pages. + * + * @nr_pages: number of pages to allocate + * @page_array: the array to fill with pages; any existing non-null entries in + * the array will be skipped * - * The bio allocation is backed by bioset and does not fail. + * Return: 0 if all pages were able to be allocated; + * -ENOMEM otherwise, and the caller is responsible for freeing all + * non-null page pointers in the array. */ -struct bio *btrfs_bio_alloc(unsigned int nr_iovecs) -{ - struct bio *bio; - - ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS); - bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset); - btrfs_bio_init(btrfs_bio(bio)); - return bio; -} - -struct bio *btrfs_bio_clone(struct bio *bio) +int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) { - struct btrfs_bio *bbio; - struct bio *new; - - /* Bio allocation backed by a bioset does not fail */ - new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset); - bbio = btrfs_bio(new); - btrfs_bio_init(bbio); - bbio->iter = bio->bi_iter; - return new; -} + unsigned int allocated; -struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) -{ - struct bio *bio; - struct btrfs_bio *bbio; + for (allocated = 0; allocated < nr_pages;) { + unsigned int last = allocated; - ASSERT(offset <= UINT_MAX && size <= UINT_MAX); + allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array); - /* this will never fail when it's backed by a bioset */ - bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset); - ASSERT(bio); + if (allocated == nr_pages) + return 0; - bbio = btrfs_bio(bio); - btrfs_bio_init(bbio); + /* + * During this iteration, no page could be allocated, even + * though alloc_pages_bulk_array() falls back to alloc_page() + * if it could not bulk-allocate. So we must be out of memory. + */ + if (allocated == last) + return -ENOMEM; - bio_trim(bio, offset >> 9, size >> 9); - bbio->iter = bio->bi_iter; - return bio; + memalloc_retry_wait(GFP_NOFS); + } + return 0; } /** @@ -3189,7 +1341,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) * a contiguous page to the previous one * @size: portion of page that we want to write * @pg_offset: starting offset in the page - * @bio_flags: flags of the current bio to see if we can merge them + * @compress_type: compression type of the current bio to see if we can merge them * * Attempt to add a page to bio considering stripe alignment etc. * @@ -3201,25 +1353,50 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, struct page *page, u64 disk_bytenr, unsigned int size, unsigned int pg_offset, - unsigned long bio_flags) + enum btrfs_compression_type compress_type) { struct bio *bio = bio_ctrl->bio; u32 bio_size = bio->bi_iter.bi_size; u32 real_size; const sector_t sector = disk_bytenr >> SECTOR_SHIFT; - bool contig; + bool contig = false; int ret; ASSERT(bio); /* The limit should be calculated when bio_ctrl->bio is allocated */ ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary); - if (bio_ctrl->bio_flags != bio_flags) + if (bio_ctrl->compress_type != compress_type) return 0; - if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) + + if (bio->bi_iter.bi_size == 0) { + /* We can always add a page into an empty bio. */ + contig = true; + } else if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE) { + struct bio_vec *bvec = bio_last_bvec_all(bio); + + /* + * The contig check requires the following conditions to be met: + * 1) The pages are belonging to the same inode + * This is implied by the call chain. + * + * 2) The range has adjacent logical bytenr + * + * 3) The range has adjacent file offset + * This is required for the usage of btrfs_bio->file_offset. + */ + if (bio_end_sector(bio) == sector && + page_offset(bvec->bv_page) + bvec->bv_offset + + bvec->bv_len == page_offset(page) + pg_offset) + contig = true; + } else { + /* + * For compression, all IO should have its logical bytenr + * set to the starting bytenr of the compressed extent. + */ contig = bio->bi_iter.bi_sector == sector; - else - contig = bio_end_sector(bio) == sector; + } + if (!contig) return 0; @@ -3259,7 +1436,7 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, * The split happens for real compressed bio, which happens in * btrfs_submit_compressed_read/write(). */ - if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) { + if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { bio_ctrl->len_to_oe_boundary = U32_MAX; bio_ctrl->len_to_stripe_boundary = U32_MAX; return 0; @@ -3299,82 +1476,90 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, static int alloc_new_bio(struct btrfs_inode *inode, struct btrfs_bio_ctrl *bio_ctrl, struct writeback_control *wbc, - unsigned int opf, - bio_end_io_t end_io_func, + blk_opf_t opf, u64 disk_bytenr, u32 offset, u64 file_offset, - unsigned long bio_flags) + enum btrfs_compression_type compress_type) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio; int ret; - bio = btrfs_bio_alloc(BIO_MAX_VECS); + ASSERT(bio_ctrl->end_io_func); + + bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, bio_ctrl->end_io_func, NULL); /* * For compressed page range, its disk_bytenr is always @disk_bytenr * passed in, no matter if we have added any range into previous bio. */ - if (bio_flags & EXTENT_BIO_COMPRESSED) + if (compress_type != BTRFS_COMPRESS_NONE) bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; else bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; bio_ctrl->bio = bio; - bio_ctrl->bio_flags = bio_flags; - bio->bi_end_io = end_io_func; - bio->bi_private = &inode->io_tree; - bio->bi_write_hint = inode->vfs_inode.i_write_hint; - bio->bi_opf = opf; + bio_ctrl->compress_type = compress_type; ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); if (ret < 0) goto error; - if (wbc) { - struct block_device *bdev; - bdev = fs_info->fs_devices->latest_dev->bdev; - bio_set_dev(bio, bdev); - wbc_init_bio(wbc, bio); - } - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct btrfs_device *device; + if (wbc) { + /* + * For Zone append we need the correct block_device that we are + * going to write to set in the bio to be able to respect the + * hardware limitation. Look it up here: + */ + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + struct btrfs_device *dev; + + dev = btrfs_zoned_get_device(fs_info, disk_bytenr, + fs_info->sectorsize); + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + goto error; + } - device = btrfs_zoned_get_device(fs_info, disk_bytenr, - fs_info->sectorsize); - if (IS_ERR(device)) { - ret = PTR_ERR(device); - goto error; + bio_set_dev(bio, dev->bdev); + } else { + /* + * Otherwise pick the last added device to support + * cgroup writeback. For multi-device file systems this + * means blk-cgroup policies have to always be set on the + * last added/replaced device. This is a bit odd but has + * been like that for a long time. + */ + bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); } - - btrfs_bio(bio)->device = device; + wbc_init_bio(wbc, bio); + } else { + ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND); } return 0; error: bio_ctrl->bio = NULL; - bio->bi_status = errno_to_blk_status(ret); - bio_endio(bio); + btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); return ret; } /* * @opf: bio REQ_OP_* and REQ_* flags as one value * @wbc: optional writeback control for io accounting - * @page: page to add to the bio * @disk_bytenr: logical bytenr where the write will be + * @page: page to add to the bio * @size: portion of page that we want to write to * @pg_offset: offset of the new bio or to check whether we are adding * a contiguous page to the previous one - * @bio_ret: must be valid pointer, newly allocated bio will be stored there - * @end_io_func: end_io callback for new bio - * @mirror_num: desired mirror to read/write - * @prev_bio_flags: flags of previous bio to see if we can merge the current one - * @bio_flags: flags of the current bio to see if we can merge them + * @compress_type: compress type for current bio + * + * The will either add the page into the existing @bio_ctrl->bio, or allocate a + * new one in @bio_ctrl->bio. + * The mirror number for this IO should already be initizlied in + * @bio_ctrl->mirror_num. */ -static int submit_extent_page(unsigned int opf, +static int submit_extent_page(blk_opf_t opf, struct writeback_control *wbc, struct btrfs_bio_ctrl *bio_ctrl, - struct page *page, u64 disk_bytenr, + u64 disk_bytenr, struct page *page, size_t size, unsigned long pg_offset, - bio_end_io_t end_io_func, - int mirror_num, - unsigned long bio_flags, + enum btrfs_compression_type compress_type, bool force_bio_submit) { int ret = 0; @@ -3385,12 +1570,11 @@ static int submit_extent_page(unsigned int opf, ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE && pg_offset + size <= PAGE_SIZE); - if (force_bio_submit && bio_ctrl->bio) { - ret = submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->bio_flags); - bio_ctrl->bio = NULL; - if (ret < 0) - return ret; - } + + ASSERT(bio_ctrl->end_io_func); + + if (force_bio_submit) + submit_one_bio(bio_ctrl); while (cur < pg_offset + size) { u32 offset = cur - pg_offset; @@ -3399,9 +1583,9 @@ static int submit_extent_page(unsigned int opf, /* Allocate new bio if needed */ if (!bio_ctrl->bio) { ret = alloc_new_bio(inode, bio_ctrl, wbc, opf, - end_io_func, disk_bytenr, offset, + disk_bytenr, offset, page_offset(page) + cur, - bio_flags); + compress_type); if (ret < 0) return ret; } @@ -3409,14 +1593,14 @@ static int submit_extent_page(unsigned int opf, * We must go through btrfs_bio_add_page() to ensure each * page range won't cross various boundaries. */ - if (bio_flags & EXTENT_BIO_COMPRESSED) + if (compress_type != BTRFS_COMPRESS_NONE) added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, size - offset, pg_offset + offset, - bio_flags); + compress_type); else added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr + offset, size - offset, - pg_offset + offset, bio_flags); + pg_offset + offset, compress_type); /* Metadata page range should never be split */ if (!is_data_inode(&inode->vfs_inode)) @@ -3430,11 +1614,7 @@ static int submit_extent_page(unsigned int opf, if (added < size - offset) { /* The bio should contain some page(s) */ ASSERT(bio_ctrl->bio->bi_iter.bi_size); - ret = submit_one_bio(bio_ctrl->bio, mirror_num, - bio_ctrl->bio_flags); - bio_ctrl->bio = NULL; - if (ret < 0) - return ret; + submit_one_bio(bio_ctrl); } cur += added; } @@ -3457,7 +1637,7 @@ static int attach_extent_buffer_page(struct extent_buffer *eb, if (page->mapping) lockdep_assert_held(&page->mapping->private_lock); - if (fs_info->sectorsize == PAGE_SIZE) { + if (fs_info->nodesize >= PAGE_SIZE) { if (!PagePrivate(page)) attach_page_private(page, eb); else @@ -3492,7 +1672,7 @@ int set_page_extent_mapped(struct page *page) fs_info = btrfs_sb(page->mapping->host->i_sb); - if (fs_info->sectorsize < PAGE_SIZE) + if (btrfs_is_subpage(fs_info, page)) return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA); attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); @@ -3509,7 +1689,7 @@ void clear_page_extent_mapped(struct page *page) return; fs_info = btrfs_sb(page->mapping->host->i_sb); - if (fs_info->sectorsize < PAGE_SIZE) + if (btrfs_is_subpage(fs_info, page)) return btrfs_detach_subpage(fs_info, page); detach_page_private(page); @@ -3534,7 +1714,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, } em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len); - if (em_cached && !IS_ERR_OR_NULL(em)) { + if (em_cached && !IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); *em_cached = em; @@ -3548,9 +1728,9 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * XXX JDM: This needs looking at to ensure proper page locking * return 0 on success, otherwise return error */ -int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, +static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, struct btrfs_bio_ctrl *bio_ctrl, - unsigned int read_flags, u64 *prev_em_start) + blk_opf_t read_flags, u64 *prev_em_start) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -3560,10 +1740,8 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, u64 extent_offset; u64 last_byte = i_size_read(inode); u64 block_start; - u64 cur_end; struct extent_map *em; int ret = 0; - int nr = 0; size_t pg_offset = 0; size_t iosize; size_t blocksize = inode->i_sb->s_blocksize; @@ -3571,7 +1749,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, ret = set_page_extent_mapped(page); if (ret < 0) { - unlock_extent(tree, start, end); + unlock_extent(tree, start, end, NULL); btrfs_page_set_error(fs_info, page, start, PAGE_SIZE); unlock_page(page); goto out; @@ -3583,9 +1761,9 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, if (zero_offset) { iosize = PAGE_SIZE - zero_offset; memzero_page(page, zero_offset, iosize); - flush_dcache_page(page); } } + bio_ctrl->end_io_func = end_bio_extent_readpage; begin_page_read(fs_info, page); while (cur <= end) { unsigned long this_bio_flag = 0; @@ -3598,35 +1776,30 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, iosize = PAGE_SIZE - pg_offset; memzero_page(page, pg_offset, iosize); - flush_dcache_page(page); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); - unlock_extent_cached(tree, cur, - cur + iosize - 1, &cached); + unlock_extent(tree, cur, cur + iosize - 1, &cached); end_page_read(page, true, cur, iosize); break; } em = __get_extent_map(inode, page, pg_offset, cur, end - cur + 1, em_cached); - if (IS_ERR_OR_NULL(em)) { - unlock_extent(tree, cur, end); + if (IS_ERR(em)) { + unlock_extent(tree, cur, end, NULL); end_page_read(page, false, cur, end + 1 - cur); + ret = PTR_ERR(em); break; } extent_offset = cur - em->start; BUG_ON(extent_map_end(em) <= cur); BUG_ON(end < cur); - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - this_bio_flag |= EXTENT_BIO_COMPRESSED; - extent_set_compress_type(&this_bio_flag, - em->compress_type); - } + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + this_bio_flag = em->compress_type; iosize = min(extent_map_end(em) - cur, end - cur + 1); - cur_end = min(extent_map_end(em) - 1, end); iosize = ALIGN(iosize, blocksize); - if (this_bio_flag & EXTENT_BIO_COMPRESSED) + if (this_bio_flag != BTRFS_COMPRESS_NONE) disk_bytenr = em->block_start; else disk_bytenr = em->block_start + extent_offset; @@ -3684,48 +1857,35 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, struct extent_state *cached = NULL; memzero_page(page, pg_offset, iosize); - flush_dcache_page(page); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); - unlock_extent_cached(tree, cur, - cur + iosize - 1, &cached); + unlock_extent(tree, cur, cur + iosize - 1, &cached); end_page_read(page, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; } /* the get_extent function already copied into the page */ - if (test_range_bit(tree, cur, cur_end, - EXTENT_UPTODATE, 1, NULL)) { - unlock_extent(tree, cur, cur + iosize - 1); - end_page_read(page, true, cur, iosize); - cur = cur + iosize; - pg_offset += iosize; - continue; - } - /* we have an inline extent but it didn't get marked up - * to date. Error out - */ if (block_start == EXTENT_MAP_INLINE) { - unlock_extent(tree, cur, cur + iosize - 1); - end_page_read(page, false, cur, iosize); + unlock_extent(tree, cur, cur + iosize - 1, NULL); + end_page_read(page, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; } ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, - bio_ctrl, page, disk_bytenr, iosize, - pg_offset, - end_bio_extent_readpage, 0, - this_bio_flag, + bio_ctrl, disk_bytenr, page, iosize, + pg_offset, this_bio_flag, force_bio_submit); - if (!ret) { - nr++; - } else { - unlock_extent(tree, cur, cur + iosize - 1); - end_page_read(page, false, cur, iosize); + if (ret) { + /* + * We have to unlock the remaining range, or the page + * will never be unlocked. + */ + unlock_extent(tree, cur, end, NULL); + end_page_read(page, false, cur, end + 1 - cur); goto out; } cur = cur + iosize; @@ -3735,6 +1895,26 @@ out: return ret; } +int btrfs_read_folio(struct file *file, struct folio *folio) +{ + struct page *page = &folio->page; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + u64 start = page_offset(page); + u64 end = start + PAGE_SIZE - 1; + struct btrfs_bio_ctrl bio_ctrl = { 0 }; + int ret; + + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); + + ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); + /* + * If btrfs_do_readpage() failed we will want to submit the assembled + * bio to do the cleanup. + */ + submit_one_bio(&bio_ctrl); + return ret; +} + static inline void contiguous_readpages(struct page *pages[], int nr_pages, u64 start, u64 end, struct extent_map **em_cached, @@ -3753,12 +1933,6 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, } } -static void update_nr_written(struct writeback_control *wbc, - unsigned long nr_written) -{ - wbc->nr_to_write -= nr_written; -} - /* * helper for __extent_writepage, doing all of the delayed allocation setup. * @@ -3858,7 +2032,7 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, * For regular sector size == page size case, since one page only * contains one sector, we return the page offset directly. */ - if (fs_info->sectorsize == PAGE_SIZE) { + if (!btrfs_is_subpage(fs_info, page)) { *start = page_offset(page); *end = page_offset(page) + PAGE_SIZE; return; @@ -3901,10 +2075,12 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, u64 extent_offset; u64 block_start; struct extent_map *em; + int saved_ret = 0; int ret = 0; int nr = 0; - u32 opf = REQ_OP_WRITE; - const unsigned int write_flags = wbc_to_write_flags(wbc); + enum req_op op = REQ_OP_WRITE; + const blk_opf_t write_flags = wbc_to_write_flags(wbc); + bool has_error = false; bool compressed; ret = btrfs_writepage_cow_fixup(page); @@ -3919,8 +2095,9 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * we don't want to touch the inode after unlocking the page, * so we update the mapping writeback index now */ - update_nr_written(wbc, 1); + wbc->nr_to_write--; + epd->bio_ctrl.end_io_func = end_bio_extent_writepage; while (cur <= end) { u64 disk_bytenr; u64 em_end; @@ -3951,9 +2128,12 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, } em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); - if (IS_ERR_OR_NULL(em)) { + if (IS_ERR(em)) { btrfs_page_set_error(fs_info, page, cur, end - cur + 1); ret = PTR_ERR_OR_ZERO(em); + has_error = true; + if (!saved_ret) + saved_ret = ret; break; } @@ -3974,7 +2154,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, iosize = min(min(em_end, end + 1), dirty_range_end) - cur; if (btrfs_use_zone_append(inode, em->block_start)) - opf = REQ_OP_ZONE_APPEND; + op = REQ_OP_ZONE_APPEND; free_extent_map(em); em = NULL; @@ -4010,13 +2190,16 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, */ btrfs_page_clear_dirty(fs_info, page, cur, iosize); - ret = submit_extent_page(opf | write_flags, wbc, - &epd->bio_ctrl, page, - disk_bytenr, iosize, + ret = submit_extent_page(op | write_flags, wbc, + &epd->bio_ctrl, disk_bytenr, + page, iosize, cur - page_offset(page), - end_bio_extent_writepage, - 0, 0, false); + 0, false); if (ret) { + has_error = true; + if (!saved_ret) + saved_ret = ret; + btrfs_page_set_error(fs_info, page, cur, iosize); if (PageWriteback(page)) btrfs_page_clear_writeback(fs_info, page, cur, @@ -4030,8 +2213,10 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * If we finish without problem, we should not only clear page dirty, * but also empty subpage dirty bits */ - if (!ret) + if (!has_error) btrfs_page_assert_not_dirty(fs_info, page); + else + ret = saved_ret; *nr_ret = nr; return ret; } @@ -4048,6 +2233,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, static int __extent_writepage(struct page *page, struct writeback_control *wbc, struct extent_page_data *epd) { + struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u64 page_start = page_offset(page); @@ -4068,15 +2254,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, pg_offset = offset_in_page(i_size); if (page->index > end_index || (page->index == end_index && !pg_offset)) { - page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); - unlock_page(page); + folio_invalidate(folio, 0, folio_size(folio)); + folio_unlock(folio); return 0; } - if (page->index == end_index) { + if (page->index == end_index) memzero_page(page, pg_offset, PAGE_SIZE - pg_offset); - flush_dcache_page(page); - } ret = set_page_extent_mapped(page); if (ret < 0) { @@ -4161,9 +2345,6 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb) static void end_extent_buffer_writeback(struct extent_buffer *eb) { - if (test_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags)) - btrfs_zone_finish_endio(eb->fs_info, eb->start, eb->len); - clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); smp_mb__after_atomic(); wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); @@ -4183,14 +2364,12 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb struct extent_page_data *epd) { struct btrfs_fs_info *fs_info = eb->fs_info; - int i, num_pages, failed_page_nr; + int i, num_pages; int flush = 0; int ret = 0; if (!btrfs_try_tree_write_lock(eb)) { - ret = flush_write_bio(epd); - if (ret < 0) - return ret; + submit_write_bio(epd, 0); flush = 1; btrfs_tree_lock(eb); } @@ -4200,9 +2379,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (!epd->sync_io) return 0; if (!flush) { - ret = flush_write_bio(epd); - if (ret < 0) - return ret; + submit_write_bio(epd, 0); flush = 1; } while (1) { @@ -4240,7 +2417,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb * Subpage metadata doesn't use page locking at all, so we can skip * the page locking. */ - if (!ret || fs_info->sectorsize < PAGE_SIZE) + if (!ret || fs_info->nodesize < PAGE_SIZE) return ret; num_pages = num_extent_pages(eb); @@ -4249,14 +2426,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (!trylock_page(p)) { if (!flush) { - int err; - - err = flush_write_bio(epd); - if (err < 0) { - ret = err; - failed_page_nr = i; - goto err_unlock; - } + submit_write_bio(epd, 0); flush = 1; } lock_page(p); @@ -4264,25 +2434,6 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb } return ret; -err_unlock: - /* Unlock already locked pages */ - for (i = 0; i < failed_page_nr; i++) - unlock_page(eb->pages[i]); - /* - * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it. - * Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can - * be made and undo everything done before. - */ - btrfs_tree_lock(eb); - spin_lock(&eb->refs_lock); - set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - end_extent_buffer_writeback(eb); - spin_unlock(&eb->refs_lock); - percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len, - fs_info->dirty_metadata_batch); - btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); - btrfs_tree_unlock(eb); - return ret; } static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) @@ -4393,14 +2544,15 @@ static struct extent_buffer *find_extent_buffer_nolock( * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback() * after all extent buffers in the page has finished their writeback. */ -static void end_bio_subpage_eb_writepage(struct bio *bio) +static void end_bio_subpage_eb_writepage(struct btrfs_bio *bbio) { + struct bio *bio = &bbio->bio; struct btrfs_fs_info *fs_info; struct bio_vec *bvec; struct bvec_iter_all iter_all; fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb); - ASSERT(fs_info->sectorsize < PAGE_SIZE); + ASSERT(fs_info->nodesize < PAGE_SIZE); ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { @@ -4450,8 +2602,9 @@ static void end_bio_subpage_eb_writepage(struct bio *bio) bio_put(bio); } -static void end_bio_extent_buffer_writepage(struct bio *bio) +static void end_bio_extent_buffer_writepage(struct btrfs_bio *bbio) { + struct bio *bio = &bbio->bio; struct bio_vec *bvec; struct extent_buffer *eb; int done; @@ -4517,7 +2670,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb, { struct btrfs_fs_info *fs_info = eb->fs_info; struct page *page = eb->pages[0]; - unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; + blk_opf_t write_flags = wbc_to_write_flags(wbc); bool no_dirty_ebs = false; int ret; @@ -4533,10 +2686,11 @@ static int write_one_subpage_eb(struct extent_buffer *eb, if (no_dirty_ebs) clear_page_dirty_for_io(page); + epd->bio_ctrl.end_io_func = end_bio_subpage_eb_writepage; + ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - &epd->bio_ctrl, page, eb->start, eb->len, - eb->start - page_offset(page), - end_bio_subpage_eb_writepage, 0, 0, false); + &epd->bio_ctrl, eb->start, page, eb->len, + eb->start - page_offset(page), 0, false); if (ret) { btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len); set_btree_ioerr(page, eb); @@ -4552,7 +2706,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb, * dirty anymore, we have submitted a page. Update nr_written in wbc. */ if (no_dirty_ebs) - update_nr_written(wbc, 1); + wbc->nr_to_write--; return ret; } @@ -4562,11 +2716,13 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, { u64 disk_bytenr = eb->start; int i, num_pages; - unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; + blk_opf_t write_flags = wbc_to_write_flags(wbc); int ret = 0; prepare_eb_write(eb); + epd->bio_ctrl.end_io_func = end_bio_extent_buffer_writepage; + num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i]; @@ -4574,10 +2730,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - &epd->bio_ctrl, p, disk_bytenr, - PAGE_SIZE, 0, - end_bio_extent_buffer_writepage, - 0, 0, false); + &epd->bio_ctrl, disk_bytenr, p, + PAGE_SIZE, 0, 0, false); if (ret) { set_btree_ioerr(p, eb); if (PageWriteback(p)) @@ -4588,7 +2742,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, break; } disk_bytenr += PAGE_SIZE; - update_nr_written(wbc, 1); + wbc->nr_to_write--; unlock_page(p); } @@ -4691,7 +2845,7 @@ static int submit_eb_subpage(struct page *page, cleanup: /* We hit error, end bio for the submitted extent buffers */ - end_write_bio(epd, ret); + submit_write_bio(epd, ret); return ret; } @@ -4727,7 +2881,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, if (!PagePrivate(page)) return 0; - if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) return submit_eb_subpage(page, wbc, epd); spin_lock(&mapping->private_lock); @@ -4780,11 +2934,11 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, return ret; } if (cache) { - /* Impiles write in zoned mode */ + /* + * Implies write in zoned mode. Mark the last eb in a block group. + */ + btrfs_schedule_zone_finish_bg(cache, eb); btrfs_put_block_group(cache); - /* Mark the last eb in a block group */ - if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity) - set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags); } ret = write_one_eb(eb, wbc, epd); free_extent_buffer(eb); @@ -4870,10 +3024,6 @@ retry: index = 0; goto retry; } - if (ret < 0) { - end_write_bio(&epd, ret); - goto out; - } /* * If something went wrong, don't allow any metadata write bio to be * submitted. @@ -4900,14 +3050,16 @@ retry: * Now such dirty tree block will not be cleaned by any dirty * extent io tree. Thus we don't want to submit such wild eb * if the fs already has error. + * + * We can get ret > 0 from submit_extent_page() indicating how many ebs + * were submitted. Reset it to 0 to avoid false alerts for the caller. */ - if (!BTRFS_FS_ERROR(fs_info)) { - ret = flush_write_bio(&epd); - } else { + if (ret > 0) + ret = 0; + if (!ret && BTRFS_FS_ERROR(fs_info)) ret = -EROFS; - end_write_bio(&epd, ret); - } -out: + submit_write_bio(&epd, ret); + btrfs_zoned_meta_io_unlock(fs_info); return ret; } @@ -5010,8 +3162,7 @@ retry: * tmpfs file mapping */ if (!trylock_page(page)) { - ret = flush_write_bio(epd); - BUG_ON(ret < 0); + submit_write_bio(epd, 0); lock_page(page); } @@ -5021,10 +3172,8 @@ retry: } if (wbc->sync_mode != WB_SYNC_NONE) { - if (PageWriteback(page)) { - ret = flush_write_bio(epd); - BUG_ON(ret < 0); - } + if (PageWriteback(page)) + submit_write_bio(epd, 0); wait_on_page_writeback(page); } @@ -5064,9 +3213,8 @@ retry: * page in our current bio, and thus deadlock, so flush the * write bio here. */ - ret = flush_write_bio(epd); - if (!ret) - goto retry; + submit_write_bio(epd, 0); + goto retry; } if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) @@ -5076,27 +3224,6 @@ retry: return ret; } -int extent_write_full_page(struct page *page, struct writeback_control *wbc) -{ - int ret; - struct extent_page_data epd = { - .bio_ctrl = { 0 }, - .extent_locked = 0, - .sync_io = wbc->sync_mode == WB_SYNC_ALL, - }; - - ret = __extent_writepage(page, wbc, &epd); - ASSERT(ret <= 0); - if (ret < 0) { - end_write_bio(&epd, ret); - return ret; - } - - ret = flush_write_bio(&epd); - ASSERT(ret <= 0); - return ret; -} - /* * Submit the pages in the range to bio for call sites which delalloc range has * already been ran (aka, ordered extent inserted) and all pages are still @@ -5154,10 +3281,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) cur = cur_end + 1; } - if (!found_error) - ret = flush_write_bio(&epd); - else - end_write_bio(&epd, ret); + submit_write_bio(&epd, found_error ? ret : 0); wbc_detach_inode(&wbc_writepages); if (found_error) @@ -5182,13 +3306,8 @@ int extent_writepages(struct address_space *mapping, */ btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); ret = extent_write_cache_pages(mapping, wbc, &epd); + submit_write_bio(&epd, ret); btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); - ASSERT(ret <= 0); - if (ret < 0) { - end_write_bio(&epd, ret); - return ret; - } - ret = flush_write_bio(&epd); return ret; } @@ -5210,25 +3329,21 @@ void extent_readahead(struct readahead_control *rac) if (em_cached) free_extent_map(em_cached); - - if (bio_ctrl.bio) { - if (submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags)) - return; - } + submit_one_bio(&bio_ctrl); } /* - * basic invalidatepage code, this waits on any locked or writeback - * ranges corresponding to the page, and then deletes any extent state + * basic invalidate_folio code, this waits on any locked or writeback + * ranges corresponding to the folio, and then deletes any extent state * records from the tree */ -int extent_invalidatepage(struct extent_io_tree *tree, - struct page *page, unsigned long offset) +int extent_invalidate_folio(struct extent_io_tree *tree, + struct folio *folio, size_t offset) { struct extent_state *cached_state = NULL; - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; - size_t blocksize = page->mapping->host->i_sb->s_blocksize; + u64 start = folio_pos(folio); + u64 end = start + folio_size(folio) - 1; + size_t blocksize = folio->mapping->host->i_sb->s_blocksize; /* This function is only called for the btree inode */ ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO); @@ -5237,20 +3352,20 @@ int extent_invalidatepage(struct extent_io_tree *tree, if (start > end) return 0; - lock_extent_bits(tree, start, end, &cached_state); - wait_on_page_writeback(page); + lock_extent(tree, start, end, &cached_state); + folio_wait_writeback(folio); /* * Currently for btree io tree, only EXTENT_LOCKED is utilized, * so here we only need to unlock the extent range to free any * existing extent state. */ - unlock_extent_cached(tree, start, end, &cached_state); + unlock_extent(tree, start, end, &cached_state); return 0; } /* - * a helper for releasepage, this tests for areas of the page that + * a helper for release_folio, this tests for areas of the page that * are locked or under IO and drops the related state bits if it is safe * to drop the page. */ @@ -5264,15 +3379,17 @@ static int try_release_extent_state(struct extent_io_tree *tree, if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) { ret = 0; } else { + u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | + EXTENT_DELALLOC_NEW | EXTENT_CTLBITS); + /* * At this point we can safely clear everything except the * locked bit, the nodatasum bit and the delalloc new bit. * The delalloc new bit will be cleared by ordered extent * completion. */ - ret = __clear_extent_bit(tree, start, end, - ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW), - 0, 0, NULL, mask, NULL); + ret = __clear_extent_bit(tree, start, end, clear_bits, NULL, + mask, NULL); /* if clear_extent_bit failed for enomem reasons, * we can't allow the release to continue. @@ -5286,7 +3403,7 @@ static int try_release_extent_state(struct extent_io_tree *tree, } /* - * a helper for releasepage. As long as there are no locked extents + * a helper for release_folio. As long as there are no locked extents * in the range corresponding to the page, both state records and extent * map records are removed */ @@ -5371,42 +3488,6 @@ next: } /* - * helper function for fiemap, which doesn't want to see any holes. - * This maps until we find something past 'last' - */ -static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode, - u64 offset, u64 last) -{ - u64 sectorsize = btrfs_inode_sectorsize(inode); - struct extent_map *em; - u64 len; - - if (offset >= last) - return NULL; - - while (1) { - len = last - offset; - if (len == 0) - break; - len = ALIGN(len, sectorsize); - em = btrfs_get_extent_fiemap(inode, offset, len); - if (IS_ERR_OR_NULL(em)) - return em; - - /* if this isn't a hole return it */ - if (em->block_start != EXTENT_MAP_HOLE) - return em; - - /* this is a hole, advance to the next extent */ - offset = extent_map_end(em); - free_extent_map(em); - if (offset >= last) - break; - } - return NULL; -} - -/* * To cache previous fiemap extent * * Will be used for merging fiemap extent @@ -5435,6 +3516,9 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, { int ret = 0; + /* Set at the end of extent_fiemap(). */ + ASSERT((flags & FIEMAP_EXTENT_LAST) == 0); + if (!cache->cached) goto assign; @@ -5458,16 +3542,13 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, * So truly compressed (physical size smaller than logical size) * extents won't get merged with each other * - * 3) Share same flags except FIEMAP_EXTENT_LAST - * So regular extent won't get merged with prealloc extent + * 3) Share same flags */ if (cache->offset + cache->len == offset && cache->phys + cache->len == phys && - (cache->flags & ~FIEMAP_EXTENT_LAST) == - (flags & ~FIEMAP_EXTENT_LAST)) { + cache->flags == flags) { cache->len += len; - cache->flags |= flags; - goto try_submit_last; + return 0; } /* Not mergeable, need to submit cached one */ @@ -5482,13 +3563,8 @@ assign: cache->phys = phys; cache->len = len; cache->flags = flags; -try_submit_last: - if (cache->flags & FIEMAP_EXTENT_LAST) { - ret = fiemap_fill_next_extent(fieinfo, cache->offset, - cache->phys, cache->len, cache->flags); - cache->cached = false; - } - return ret; + + return 0; } /* @@ -5518,215 +3594,534 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, return ret; } -int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, - u64 start, u64 len) +static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path) { - int ret = 0; - u64 off; - u64 max = start + len; - u32 flags = 0; - u32 found_type; - u64 last; - u64 last_for_get_extent = 0; - u64 disko = 0; - u64 isize = i_size_read(&inode->vfs_inode); - struct btrfs_key found_key; - struct extent_map *em = NULL; - struct extent_state *cached_state = NULL; - struct btrfs_path *path; - struct btrfs_root *root = inode->root; - struct fiemap_cache cache = { 0 }; - struct ulist *roots; - struct ulist *tmp_ulist; - int end = 0; - u64 em_start = 0; - u64 em_len = 0; - u64 em_end = 0; + struct extent_buffer *clone; + struct btrfs_key key; + int slot; + int ret; - if (len == 0) - return -EINVAL; + path->slots[0]++; + if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) + return 0; - path = btrfs_alloc_path(); - if (!path) + ret = btrfs_next_leaf(inode->root, path); + if (ret != 0) + return ret; + + /* + * Don't bother with cloning if there are no more file extent items for + * our inode. + */ + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) + return 1; + + /* See the comment at fiemap_search_slot() about why we clone. */ + clone = btrfs_clone_extent_buffer(path->nodes[0]); + if (!clone) return -ENOMEM; - roots = ulist_alloc(GFP_KERNEL); - tmp_ulist = ulist_alloc(GFP_KERNEL); - if (!roots || !tmp_ulist) { - ret = -ENOMEM; - goto out_free_ulist; + slot = path->slots[0]; + btrfs_release_path(path); + path->nodes[0] = clone; + path->slots[0] = slot; + + return 0; +} + +/* + * Search for the first file extent item that starts at a given file offset or + * the one that starts immediately before that offset. + * Returns: 0 on success, < 0 on error, 1 if not found. + */ +static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path, + u64 file_offset) +{ + const u64 ino = btrfs_ino(inode); + struct btrfs_root *root = inode->root; + struct extent_buffer *clone; + struct btrfs_key key; + int slot; + int ret; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = file_offset; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + + if (ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret != 0) + return ret; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) + return 1; } /* - * We can't initialize that to 'start' as this could miss extents due - * to extent item merging + * We clone the leaf and use it during fiemap. This is because while + * using the leaf we do expensive things like checking if an extent is + * shared, which can take a long time. In order to prevent blocking + * other tasks for too long, we use a clone of the leaf. We have locked + * the file range in the inode's io tree, so we know none of our file + * extent items can change. This way we avoid blocking other tasks that + * want to insert items for other inodes in the same leaf or b+tree + * rebalance operations (triggered for example when someone is trying + * to push items into this leaf when trying to insert an item in a + * neighbour leaf). + * We also need the private clone because holding a read lock on an + * extent buffer of the subvolume's b+tree will make lockdep unhappy + * when we call fiemap_fill_next_extent(), because that may cause a page + * fault when filling the user space buffer with fiemap data. */ - off = 0; - start = round_down(start, btrfs_inode_sectorsize(inode)); - len = round_up(max, btrfs_inode_sectorsize(inode)) - start; + clone = btrfs_clone_extent_buffer(path->nodes[0]); + if (!clone) + return -ENOMEM; + + slot = path->slots[0]; + btrfs_release_path(path); + path->nodes[0] = clone; + path->slots[0] = slot; + + return 0; +} + +/* + * Process a range which is a hole or a prealloc extent in the inode's subvolume + * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc + * extent. The end offset (@end) is inclusive. + */ +static int fiemap_process_hole(struct btrfs_inode *inode, + struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache, + struct btrfs_backref_shared_cache *backref_cache, + u64 disk_bytenr, u64 extent_offset, + u64 extent_gen, + struct ulist *roots, struct ulist *tmp_ulist, + u64 start, u64 end) +{ + const u64 i_size = i_size_read(&inode->vfs_inode); + const u64 ino = btrfs_ino(inode); + u64 cur_offset = start; + u64 last_delalloc_end = 0; + u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN; + bool checked_extent_shared = false; + int ret; /* - * lookup the last file extent. We're not using i_size here - * because there might be preallocation past i_size + * There can be no delalloc past i_size, so don't waste time looking for + * it beyond i_size. */ - ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1, - 0); - if (ret < 0) { - goto out_free_ulist; - } else { - WARN_ON(!ret); - if (ret == 1) - ret = 0; - } + while (cur_offset < end && cur_offset < i_size) { + u64 delalloc_start; + u64 delalloc_end; + u64 prealloc_start; + u64 prealloc_len = 0; + bool delalloc; + + delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end, + &delalloc_start, + &delalloc_end); + if (!delalloc) + break; - path->slots[0]--; - btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); - found_type = found_key.type; - - /* No extents, but there might be delalloc bits */ - if (found_key.objectid != btrfs_ino(inode) || - found_type != BTRFS_EXTENT_DATA_KEY) { - /* have to trust i_size as the end */ - last = (u64)-1; - last_for_get_extent = isize; - } else { /* - * remember the start of the last extent. There are a - * bunch of different factors that go into the length of the - * extent, so its much less complex to remember where it started + * If this is a prealloc extent we have to report every section + * of it that has no delalloc. */ - last = found_key.offset; - last_for_get_extent = last + 1; + if (disk_bytenr != 0) { + if (last_delalloc_end == 0) { + prealloc_start = start; + prealloc_len = delalloc_start - start; + } else { + prealloc_start = last_delalloc_end + 1; + prealloc_len = delalloc_start - prealloc_start; + } + } + + if (prealloc_len > 0) { + if (!checked_extent_shared && fieinfo->fi_extents_max) { + ret = btrfs_is_data_extent_shared(inode->root, + ino, disk_bytenr, + extent_gen, roots, + tmp_ulist, + backref_cache); + if (ret < 0) + return ret; + else if (ret > 0) + prealloc_flags |= FIEMAP_EXTENT_SHARED; + + checked_extent_shared = true; + } + ret = emit_fiemap_extent(fieinfo, cache, prealloc_start, + disk_bytenr + extent_offset, + prealloc_len, prealloc_flags); + if (ret) + return ret; + extent_offset += prealloc_len; + } + + ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0, + delalloc_end + 1 - delalloc_start, + FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); + if (ret) + return ret; + + last_delalloc_end = delalloc_end; + cur_offset = delalloc_end + 1; + extent_offset += cur_offset - delalloc_start; + cond_resched(); } - btrfs_release_path(path); /* - * we might have some extents allocated but more delalloc past those - * extents. so, we trust isize unless the start of the last extent is - * beyond isize + * Either we found no delalloc for the whole prealloc extent or we have + * a prealloc extent that spans i_size or starts at or after i_size. */ - if (last < isize) { - last = (u64)-1; - last_for_get_extent = isize; + if (disk_bytenr != 0 && last_delalloc_end < end) { + u64 prealloc_start; + u64 prealloc_len; + + if (last_delalloc_end == 0) { + prealloc_start = start; + prealloc_len = end + 1 - start; + } else { + prealloc_start = last_delalloc_end + 1; + prealloc_len = end + 1 - prealloc_start; + } + + if (!checked_extent_shared && fieinfo->fi_extents_max) { + ret = btrfs_is_data_extent_shared(inode->root, + ino, disk_bytenr, + extent_gen, roots, + tmp_ulist, + backref_cache); + if (ret < 0) + return ret; + else if (ret > 0) + prealloc_flags |= FIEMAP_EXTENT_SHARED; + } + ret = emit_fiemap_extent(fieinfo, cache, prealloc_start, + disk_bytenr + extent_offset, + prealloc_len, prealloc_flags); + if (ret) + return ret; } - lock_extent_bits(&inode->io_tree, start, start + len - 1, - &cached_state); + return 0; +} - em = get_extent_skip_holes(inode, start, last_for_get_extent); - if (!em) - goto out; - if (IS_ERR(em)) { - ret = PTR_ERR(em); +static int fiemap_find_last_extent_offset(struct btrfs_inode *inode, + struct btrfs_path *path, + u64 *last_extent_end_ret) +{ + const u64 ino = btrfs_ino(inode); + struct btrfs_root *root = inode->root; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *ei; + struct btrfs_key key; + u64 disk_bytenr; + int ret; + + /* + * Lookup the last file extent. We're not using i_size here because + * there might be preallocation past i_size. + */ + ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0); + /* There can't be a file extent item at offset (u64)-1 */ + ASSERT(ret != 0); + if (ret < 0) + return ret; + + /* + * For a non-existing key, btrfs_search_slot() always leaves us at a + * slot > 0, except if the btree is empty, which is impossible because + * at least it has the inode item for this inode and all the items for + * the root inode 256. + */ + ASSERT(path->slots[0] > 0); + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { + /* No file extent items in the subvolume tree. */ + *last_extent_end_ret = 0; + return 0; + } + + /* + * For an inline extent, the disk_bytenr is where inline data starts at, + * so first check if we have an inline extent item before checking if we + * have an implicit hole (disk_bytenr == 0). + */ + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) { + *last_extent_end_ret = btrfs_file_extent_end(path); + return 0; + } + + /* + * Find the last file extent item that is not a hole (when NO_HOLES is + * not enabled). This should take at most 2 iterations in the worst + * case: we have one hole file extent item at slot 0 of a leaf and + * another hole file extent item as the last item in the previous leaf. + * This is because we merge file extent items that represent holes. + */ + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + while (disk_bytenr == 0) { + ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); + if (ret < 0) { + return ret; + } else if (ret > 0) { + /* No file extent items that are not holes. */ + *last_extent_end_ret = 0; + return 0; + } + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + } + + *last_extent_end_ret = btrfs_file_extent_end(path); + return 0; +} + +int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + const u64 ino = btrfs_ino(inode); + struct extent_state *cached_state = NULL; + struct btrfs_path *path; + struct btrfs_root *root = inode->root; + struct fiemap_cache cache = { 0 }; + struct btrfs_backref_shared_cache *backref_cache; + struct ulist *roots; + struct ulist *tmp_ulist; + u64 last_extent_end; + u64 prev_extent_end; + u64 lockstart; + u64 lockend; + bool stopped = false; + int ret; + + backref_cache = kzalloc(sizeof(*backref_cache), GFP_KERNEL); + path = btrfs_alloc_path(); + roots = ulist_alloc(GFP_KERNEL); + tmp_ulist = ulist_alloc(GFP_KERNEL); + if (!backref_cache || !path || !roots || !tmp_ulist) { + ret = -ENOMEM; goto out; } - while (!end) { - u64 offset_in_extent = 0; + lockstart = round_down(start, root->fs_info->sectorsize); + lockend = round_up(start + len, root->fs_info->sectorsize); + prev_extent_end = lockstart; - /* break if the extent we found is outside the range */ - if (em->start >= max || extent_map_end(em) < off) - break; + lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); - /* - * get_extent may return an extent that starts before our - * requested range. We have to make sure the ranges - * we return to fiemap always move forward and don't - * overlap, so adjust the offsets here - */ - em_start = max(em->start, off); + ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); + if (ret < 0) + goto out_unlock; + btrfs_release_path(path); + path->reada = READA_FORWARD; + ret = fiemap_search_slot(inode, path, lockstart); + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { /* - * record the offset from the start of the extent - * for adjusting the disk offset below. Only do this if the - * extent isn't compressed since our in ram offset may be past - * what we have actually allocated on disk. + * No file extent item found, but we may have delalloc between + * the current offset and i_size. So check for that. */ - if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) - offset_in_extent = em_start - em->start; - em_end = extent_map_end(em); - em_len = em_end - em_start; - flags = 0; - if (em->block_start < EXTENT_MAP_LAST_BYTE) - disko = em->block_start + offset_in_extent; - else - disko = 0; + ret = 0; + goto check_eof_delalloc; + } + + while (prev_extent_end < lockend) { + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_file_extent_item *ei; + struct btrfs_key key; + u64 extent_end; + u64 extent_len; + u64 extent_offset = 0; + u64 extent_gen; + u64 disk_bytenr = 0; + u64 flags = 0; + int extent_type; + u8 compression; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) + break; + + extent_end = btrfs_file_extent_end(path); /* - * bump off for our next call to get_extent + * The first iteration can leave us at an extent item that ends + * before our range's start. Move to the next item. */ - off = extent_map_end(em); - if (off >= max) - end = 1; - - if (em->block_start == EXTENT_MAP_LAST_BYTE) { - end = 1; - flags |= FIEMAP_EXTENT_LAST; - } else if (em->block_start == EXTENT_MAP_INLINE) { - flags |= (FIEMAP_EXTENT_DATA_INLINE | - FIEMAP_EXTENT_NOT_ALIGNED); - } else if (em->block_start == EXTENT_MAP_DELALLOC) { - flags |= (FIEMAP_EXTENT_DELALLOC | - FIEMAP_EXTENT_UNKNOWN); - } else if (fieinfo->fi_extents_max) { - u64 bytenr = em->block_start - - (em->start - em->orig_start); + if (extent_end <= lockstart) + goto next_item; - /* - * As btrfs supports shared space, this information - * can be exported to userspace tools via - * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0 - * then we're just getting a count and we can skip the - * lookup stuff. - */ - ret = btrfs_check_shared(root, btrfs_ino(inode), - bytenr, roots, tmp_ulist); - if (ret < 0) - goto out_free; - if (ret) - flags |= FIEMAP_EXTENT_SHARED; - ret = 0; + /* We have in implicit hole (NO_HOLES feature enabled). */ + if (prev_extent_end < key.offset) { + const u64 range_end = min(key.offset, lockend) - 1; + + ret = fiemap_process_hole(inode, fieinfo, &cache, + backref_cache, 0, 0, 0, + roots, tmp_ulist, + prev_extent_end, range_end); + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { + /* fiemap_fill_next_extent() told us to stop. */ + stopped = true; + break; + } + + /* We've reached the end of the fiemap range, stop. */ + if (key.offset >= lockend) { + stopped = true; + break; + } } - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + + extent_len = extent_end - key.offset; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + compression = btrfs_file_extent_compression(leaf, ei); + extent_type = btrfs_file_extent_type(leaf, ei); + extent_gen = btrfs_file_extent_generation(leaf, ei); + + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + if (compression == BTRFS_COMPRESS_NONE) + extent_offset = btrfs_file_extent_offset(leaf, ei); + } + + if (compression != BTRFS_COMPRESS_NONE) flags |= FIEMAP_EXTENT_ENCODED; - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - flags |= FIEMAP_EXTENT_UNWRITTEN; - free_extent_map(em); - em = NULL; - if ((em_start >= last) || em_len == (u64)-1 || - (last == (u64)-1 && isize <= em_end)) { - flags |= FIEMAP_EXTENT_LAST; - end = 1; + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + flags |= FIEMAP_EXTENT_DATA_INLINE; + flags |= FIEMAP_EXTENT_NOT_ALIGNED; + ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0, + extent_len, flags); + } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + ret = fiemap_process_hole(inode, fieinfo, &cache, + backref_cache, + disk_bytenr, extent_offset, + extent_gen, roots, tmp_ulist, + key.offset, extent_end - 1); + } else if (disk_bytenr == 0) { + /* We have an explicit hole. */ + ret = fiemap_process_hole(inode, fieinfo, &cache, + backref_cache, 0, 0, 0, + roots, tmp_ulist, + key.offset, extent_end - 1); + } else { + /* We have a regular extent. */ + if (fieinfo->fi_extents_max) { + ret = btrfs_is_data_extent_shared(root, ino, + disk_bytenr, + extent_gen, + roots, + tmp_ulist, + backref_cache); + if (ret < 0) + goto out_unlock; + else if (ret > 0) + flags |= FIEMAP_EXTENT_SHARED; + } + + ret = emit_fiemap_extent(fieinfo, &cache, key.offset, + disk_bytenr + extent_offset, + extent_len, flags); } - /* now scan forward to see if this is really the last extent. */ - em = get_extent_skip_holes(inode, off, last_for_get_extent); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { + /* fiemap_fill_next_extent() told us to stop. */ + stopped = true; + break; } - if (!em) { - flags |= FIEMAP_EXTENT_LAST; - end = 1; + + prev_extent_end = extent_end; +next_item: + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out_unlock; } - ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko, - em_len, flags); - if (ret) { - if (ret == 1) - ret = 0; - goto out_free; + + ret = fiemap_next_leaf_item(inode, path); + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { + /* No more file extent items for this inode. */ + break; + } + cond_resched(); + } + +check_eof_delalloc: + /* + * Release (and free) the path before emitting any final entries to + * fiemap_fill_next_extent() to keep lockdep happy. This is because + * once we find no more file extent items exist, we may have a + * non-cloned leaf, and fiemap_fill_next_extent() can trigger page + * faults when copying data to the user space buffer. + */ + btrfs_free_path(path); + path = NULL; + + if (!stopped && prev_extent_end < lockend) { + ret = fiemap_process_hole(inode, fieinfo, &cache, backref_cache, + 0, 0, 0, roots, tmp_ulist, + prev_extent_end, lockend - 1); + if (ret < 0) + goto out_unlock; + prev_extent_end = lockend; + } + + if (cache.cached && cache.offset + cache.len >= last_extent_end) { + const u64 i_size = i_size_read(&inode->vfs_inode); + + if (prev_extent_end < i_size) { + u64 delalloc_start; + u64 delalloc_end; + bool delalloc; + + delalloc = btrfs_find_delalloc_in_range(inode, + prev_extent_end, + i_size - 1, + &delalloc_start, + &delalloc_end); + if (!delalloc) + cache.flags |= FIEMAP_EXTENT_LAST; + } else { + cache.flags |= FIEMAP_EXTENT_LAST; } } -out_free: - if (!ret) - ret = emit_last_fiemap_cache(fieinfo, &cache); - free_extent_map(em); -out: - unlock_extent_cached(&inode->io_tree, start, start + len - 1, - &cached_state); -out_free_ulist: + ret = emit_last_fiemap_cache(fieinfo, &cache); + +out_unlock: + unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); +out: + kfree(backref_cache); btrfs_free_path(path); ulist_free(roots); ulist_free(tmp_ulist); @@ -5783,7 +4178,7 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag return; } - if (fs_info->sectorsize == PAGE_SIZE) { + if (fs_info->nodesize >= PAGE_SIZE) { /* * We do this since we'll remove the pages after we've * removed the eb from the radix tree, so we could race @@ -5857,7 +4252,7 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) { btrfs_release_extent_buffer_pages(eb); - btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list); + btrfs_leak_debug_del_eb(eb); __free_extent_buffer(eb); } @@ -5874,8 +4269,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, eb->bflags = 0; init_rwsem(&eb->lock); - btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list, - &fs_info->allocated_ebs); + btrfs_leak_debug_add_eb(eb); INIT_LIST_HEAD(&eb->release_list); spin_lock_init(&eb->refs_lock); @@ -5890,9 +4284,9 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) { int i; - struct page *p; struct extent_buffer *new; int num_pages = num_extent_pages(src); + int ret; new = __alloc_extent_buffer(src->fs_info, src->start, src->len); if (new == NULL) @@ -5905,22 +4299,23 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) */ set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); + memset(new->pages, 0, sizeof(*new->pages) * num_pages); + ret = btrfs_alloc_page_array(num_pages, new->pages); + if (ret) { + btrfs_release_extent_buffer(new); + return NULL; + } + for (i = 0; i < num_pages; i++) { int ret; + struct page *p = new->pages[i]; - p = alloc_page(GFP_NOFS); - if (!p) { - btrfs_release_extent_buffer(new); - return NULL; - } ret = attach_extent_buffer_page(new, p, NULL); if (ret < 0) { - put_page(p); btrfs_release_extent_buffer(new); return NULL; } WARN_ON(PageDirty(p)); - new->pages[i] = p; copy_page(page_address(p), page_address(src->pages[i])); } set_extent_buffer_uptodate(new); @@ -5934,31 +4329,36 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, struct extent_buffer *eb; int num_pages; int i; + int ret; eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) return NULL; num_pages = num_extent_pages(eb); + ret = btrfs_alloc_page_array(num_pages, eb->pages); + if (ret) + goto err; + for (i = 0; i < num_pages; i++) { - int ret; + struct page *p = eb->pages[i]; - eb->pages[i] = alloc_page(GFP_NOFS); - if (!eb->pages[i]) - goto err; - ret = attach_extent_buffer_page(eb, eb->pages[i], NULL); + ret = attach_extent_buffer_page(eb, p, NULL); if (ret < 0) goto err; } + set_extent_buffer_uptodate(eb); btrfs_set_header_nritems(eb, 0); set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); return eb; err: - for (; i > 0; i--) { - detach_extent_buffer_page(eb, eb->pages[i - 1]); - __free_page(eb->pages[i - 1]); + for (i = 0; i < num_pages; i++) { + if (eb->pages[i]) { + detach_extent_buffer_page(eb, eb->pages[i]); + __free_page(eb->pages[i]); + } } __free_extent_buffer(eb); return NULL; @@ -5980,10 +4380,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) * * It is only cleared in two cases: freeing the last non-tree * reference to the extent_buffer when its STALE bit is set or - * calling releasepage when the tree reference is the only reference. + * calling release_folio when the tree reference is the only reference. * * In both cases, care is taken to ensure that the extent_buffer's - * pages are not under io. However, releasepage can be concurrently + * pages are not under io. However, release_folio can be concurrently * called with creating new references, which is prone to race * conditions between the calls to check_buffer_tree_ref in those * codepaths and clearing TREE_REF in try_release_extent_buffer. @@ -6103,7 +4503,7 @@ static struct extent_buffer *grab_extent_buffer( * don't try to insert two ebs for the same bytenr. So here we always * return NULL and just continue. */ - if (fs_info->sectorsize < PAGE_SIZE) + if (fs_info->nodesize < PAGE_SIZE) return NULL; /* Page not yet attached to an extent buffer */ @@ -6125,6 +4525,30 @@ static struct extent_buffer *grab_extent_buffer( return NULL; } +static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) +{ + if (!IS_ALIGNED(start, fs_info->sectorsize)) { + btrfs_err(fs_info, "bad tree block start %llu", start); + return -EINVAL; + } + + if (fs_info->nodesize < PAGE_SIZE && + offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) { + btrfs_err(fs_info, + "tree block crosses page boundary, start %llu nodesize %u", + start, fs_info->nodesize); + return -EINVAL; + } + if (fs_info->nodesize >= PAGE_SIZE && + !PAGE_ALIGNED(start)) { + btrfs_err(fs_info, + "tree block is not page aligned, start %llu nodesize %u", + start, fs_info->nodesize); + return -EINVAL; + } + return 0; +} + struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level) { @@ -6136,13 +4560,12 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, struct extent_buffer *exists = NULL; struct page *p; struct address_space *mapping = fs_info->btree_inode->i_mapping; + u64 lockdep_owner = owner_root; int uptodate = 1; int ret; - if (!IS_ALIGNED(start, fs_info->sectorsize)) { - btrfs_err(fs_info, "bad tree block start %llu", start); + if (check_eb_alignment(fs_info, start)) return ERR_PTR(-EINVAL); - } #if BITS_PER_LONG == 32 if (start >= MAX_LFS_FILESIZE) { @@ -6155,14 +4578,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, btrfs_warn_32bit_limit(fs_info); #endif - if (fs_info->sectorsize < PAGE_SIZE && - offset_in_page(start) + len > PAGE_SIZE) { - btrfs_err(fs_info, - "tree block crosses page boundary, start %llu nodesize %lu", - start, len); - return ERR_PTR(-EINVAL); - } - eb = find_extent_buffer(fs_info, start); if (eb) return eb; @@ -6170,7 +4585,15 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) return ERR_PTR(-ENOMEM); - btrfs_set_buffer_lockdep_class(owner_root, eb, level); + + /* + * The reloc trees are just snapshots, so we need them to appear to be + * just like any other fs tree WRT lockdep. + */ + if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID) + lockdep_owner = BTRFS_FS_TREE_OBJECTID; + + btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level); num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++, index++) { @@ -6192,7 +4615,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * page, but it may change in the future for 16K page size * support, so we still preallocate the memory in the loop. */ - if (fs_info->sectorsize < PAGE_SIZE) { + if (fs_info->nodesize < PAGE_SIZE) { prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); if (IS_ERR(prealloc)) { ret = PTR_ERR(prealloc); @@ -6236,7 +4659,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, /* * We can't unlock the pages just yet since the extent buffer * hasn't been properly inserted in the radix tree, this - * opens a race with btree_releasepage which can free a page + * opens a race with btree_release_folio which can free a page * while we are still filling in all pages for the buffer and * we could crash. */ @@ -6268,7 +4691,7 @@ again: /* * Now it's safe to unlock the pages because any calls to - * btree_releasepage will correctly detect that a page belongs to a + * btree_release_folio will correctly detect that a page belongs to a * live buffer and won't free them prematurely. */ for (i = 0; i < num_pages; i++) @@ -6314,7 +4737,7 @@ static int release_extent_buffer(struct extent_buffer *eb) spin_unlock(&eb->refs_lock); } - btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list); + btrfs_leak_debug_del_eb(eb); /* Should be safe to release our pages at this point */ btrfs_release_extent_buffer_pages(eb); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS @@ -6334,18 +4757,16 @@ static int release_extent_buffer(struct extent_buffer *eb) void free_extent_buffer(struct extent_buffer *eb) { int refs; - int old; if (!eb) return; + refs = atomic_read(&eb->refs); while (1) { - refs = atomic_read(&eb->refs); if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3) || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs == 1)) break; - old = atomic_cmpxchg(&eb->refs, refs, refs - 1); - if (old == refs) + if (atomic_try_cmpxchg(&eb->refs, &refs, refs - 1)) return; } @@ -6411,7 +4832,7 @@ void clear_extent_buffer_dirty(const struct extent_buffer *eb) int num_pages; struct page *page; - if (eb->fs_info->sectorsize < PAGE_SIZE) + if (eb->fs_info->nodesize < PAGE_SIZE) return clear_subpage_extent_buffer_dirty(eb); num_pages = num_extent_pages(eb); @@ -6443,7 +4864,7 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb) WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); if (!was_dirty) { - bool subpage = eb->fs_info->sectorsize < PAGE_SIZE; + bool subpage = eb->fs_info->nodesize < PAGE_SIZE; /* * For subpage case, we can have other extent buffers in the @@ -6483,9 +4904,18 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb) num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; - if (page) - btrfs_page_clear_uptodate(fs_info, page, - eb->start, eb->len); + if (!page) + continue; + + /* + * This is special handling for metadata subpage, as regular + * btrfs_is_subpage() can not handle cloned/dummy metadata. + */ + if (fs_info->nodesize >= PAGE_SIZE) + ClearPageUptodate(page); + else + btrfs_subpage_clear_uptodate(fs_info, page, eb->start, + eb->len); } } @@ -6500,7 +4930,16 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; - btrfs_page_set_uptodate(fs_info, page, eb->start, eb->len); + + /* + * This is special handling for metadata subpage, as regular + * btrfs_is_subpage() can not handle cloned/dummy metadata. + */ + if (fs_info->nodesize >= PAGE_SIZE) + SetPageUptodate(page); + else + btrfs_subpage_set_uptodate(fs_info, page, eb->start, + eb->len); } } @@ -6510,7 +4949,9 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *io_tree; struct page *page = eb->pages[0]; - struct btrfs_bio_ctrl bio_ctrl = { 0 }; + struct btrfs_bio_ctrl bio_ctrl = { + .mirror_num = mirror_num, + }; int ret = 0; ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); @@ -6521,7 +4962,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1)) return -EAGAIN; } else { - ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1); + ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL); if (ret < 0) return ret; } @@ -6531,7 +4972,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, PageUptodate(page) || btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) { set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - unlock_extent(io_tree, eb->start, eb->start + eb->len - 1); + unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL); return ret; } @@ -6539,14 +4980,14 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, eb->read_mirror = 0; atomic_set(&eb->io_pages, 1); check_buffer_tree_ref(eb); + bio_ctrl.end_io_func = end_bio_extent_readpage; + btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len); - ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, &bio_ctrl, - page, eb->start, eb->len, - eb->start - page_offset(page), - end_bio_extent_readpage, mirror_num, 0, - true); + ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl, + eb->start, page, eb->len, + eb->start - page_offset(page), 0, true); if (ret) { /* * In the endio function, if we hit something wrong we will @@ -6555,14 +4996,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, */ atomic_dec(&eb->io_pages); } - if (bio_ctrl.bio) { - int tmp; - - tmp = submit_one_bio(bio_ctrl.bio, mirror_num, 0); - bio_ctrl.bio = NULL; - if (tmp < 0) - return tmp; - } + submit_one_bio(&bio_ctrl); if (ret || wait != WAIT_COMPLETE) return ret; @@ -6582,7 +5016,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) int all_uptodate = 1; int num_pages; unsigned long num_reads = 0; - struct btrfs_bio_ctrl bio_ctrl = { 0 }; + struct btrfs_bio_ctrl bio_ctrl = { + .mirror_num = mirror_num, + }; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; @@ -6595,7 +5031,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))) return -EIO; - if (eb->fs_info->sectorsize < PAGE_SIZE) + if (eb->fs_info->nodesize < PAGE_SIZE) return read_extent_buffer_subpage(eb, wait, mirror_num); num_pages = num_extent_pages(eb); @@ -6638,10 +5074,11 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) eb->read_mirror = 0; atomic_set(&eb->io_pages, num_reads); /* - * It is possible for releasepage to clear the TREE_REF bit before we + * It is possible for release_folio to clear the TREE_REF bit before we * set io_pages. See check_buffer_tree_ref for a more detailed comment. */ check_buffer_tree_ref(eb); + bio_ctrl.end_io_func = end_bio_extent_readpage; for (i = 0; i < num_pages; i++) { page = eb->pages[i]; @@ -6653,10 +5090,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } ClearPageError(page); - err = submit_extent_page(REQ_OP_READ | REQ_META, NULL, - &bio_ctrl, page, page_offset(page), - PAGE_SIZE, 0, end_bio_extent_readpage, - mirror_num, 0, false); + err = submit_extent_page(REQ_OP_READ, NULL, + &bio_ctrl, page_offset(page), page, + PAGE_SIZE, 0, 0, false); if (err) { /* * We failed to submit the bio so it's the @@ -6673,12 +5109,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } } - if (bio_ctrl.bio) { - err = submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.bio_flags); - bio_ctrl.bio = NULL; - if (err) - return err; - } + submit_one_bio(&bio_ctrl); if (ret || wait != WAIT_COMPLETE) return ret; @@ -6841,14 +5272,24 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb, { struct btrfs_fs_info *fs_info = eb->fs_info; - if (fs_info->sectorsize < PAGE_SIZE) { - bool uptodate; + /* + * If we are using the commit root we could potentially clear a page + * Uptodate while we're using the extent buffer that we've previously + * looked up. We don't want to complain in this case, as the page was + * valid before, we just didn't write it out. Instead we want to catch + * the case where we didn't actually read the block properly, which + * would have !PageUptodate && !PageError, as we clear PageError before + * reading. + */ + if (fs_info->nodesize < PAGE_SIZE) { + bool uptodate, error; uptodate = btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len); - WARN_ON(!uptodate); + error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len); + WARN_ON(!uptodate && !error); } else { - WARN_ON(!PageUptodate(page)); + WARN_ON(!PageUptodate(page) && !PageError(page)); } } @@ -6942,7 +5383,7 @@ void copy_extent_buffer_full(const struct extent_buffer *dst, ASSERT(dst->len == src->len); - if (dst->fs_info->sectorsize == PAGE_SIZE) { + if (dst->fs_info->nodesize >= PAGE_SIZE) { num_pages = num_extent_pages(dst); for (i = 0; i < num_pages; i++) copy_page(page_address(dst->pages[i]), @@ -6951,7 +5392,7 @@ void copy_extent_buffer_full(const struct extent_buffer *dst, size_t src_offset = get_eb_offset_in_page(src, 0); size_t dst_offset = get_eb_offset_in_page(dst, 0); - ASSERT(src->fs_info->sectorsize < PAGE_SIZE); + ASSERT(src->fs_info->nodesize < PAGE_SIZE); memcpy(page_address(dst->pages[0]) + dst_offset, page_address(src->pages[0]) + src_offset, src->len); @@ -7344,7 +5785,7 @@ int try_release_extent_buffer(struct page *page) { struct extent_buffer *eb; - if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) return try_release_subpage_extent_buffer(page); /* diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 0399cf8e3c32..7929f054dda3 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -7,15 +7,9 @@ #include <linux/refcount.h> #include <linux/fiemap.h> #include <linux/btrfs_tree.h> +#include "compression.h" #include "ulist.h" -/* - * flags for bio submission. The high bits indicate the compression - * type for this bio - */ -#define EXTENT_BIO_COMPRESSED 1 -#define EXTENT_BIO_FLAG_SHIFT 16 - enum { EXTENT_BUFFER_UPTODATE, EXTENT_BUFFER_DIRTY, @@ -32,7 +26,6 @@ enum { /* write IO error */ EXTENT_BUFFER_WRITE_ERR, EXTENT_BUFFER_NO_CHECK, - EXTENT_BUFFER_ZONE_FINISH, }; /* these are flags for __process_pages_contig */ @@ -64,16 +57,19 @@ enum { #define BITMAP_LAST_BYTE_MASK(nbits) \ (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) +struct btrfs_bio; struct btrfs_root; struct btrfs_inode; -struct btrfs_io_bio; struct btrfs_fs_info; struct io_failure_record; struct extent_io_tree; -typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio, +int __init extent_buffer_init_cachep(void); +void __cold extent_buffer_free_cachep(void); + +typedef void (submit_bio_hook_t)(struct inode *inode, struct bio *bio, int mirror_num, - unsigned long bio_flags); + enum btrfs_compression_type compress_type); typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode, struct bio *bio, u64 dio_file_offset); @@ -103,22 +99,11 @@ struct extent_buffer { }; /* - * Structure to record info about the bio being assembled, and other info like - * how many bytes are there before stripe/ordered extent boundary. - */ -struct btrfs_bio_ctrl { - struct bio *bio; - unsigned long bio_flags; - u32 len_to_stripe_boundary; - u32 len_to_oe_boundary; -}; - -/* * Structure to record how many bytes and which ranges are set/cleared */ struct extent_changeset { /* How many bytes are set/cleared in this operation */ - unsigned int bytes_changed; + u64 bytes_changed; /* Changed ranges */ struct ulist range_changed; @@ -158,32 +143,12 @@ static inline void extent_changeset_free(struct extent_changeset *changeset) kfree(changeset); } -static inline void extent_set_compress_type(unsigned long *bio_flags, - int compress_type) -{ - *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT; -} - -static inline int extent_compress_type(unsigned long bio_flags) -{ - return bio_flags >> EXTENT_BIO_FLAG_SHIFT; -} - struct extent_map_tree; -typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len); - int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); -int __must_check submit_one_bio(struct bio *bio, int mirror_num, - unsigned long bio_flags); -int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, - struct btrfs_bio_ctrl *bio_ctrl, - unsigned int read_flags, u64 *prev_em_start); -int extent_write_full_page(struct page *page, struct writeback_control *wbc); +int btrfs_read_folio(struct file *file, struct folio *folio); int extent_write_locked_range(struct inode *inode, u64 start, u64 end); int extent_writepages(struct address_space *mapping, struct writeback_control *wbc); @@ -277,9 +242,10 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, u32 bits_to_clear, unsigned long page_ops); -struct bio *btrfs_bio_alloc(unsigned int nr_iovecs); -struct bio *btrfs_bio_clone(struct bio *bio); -struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size); +int extent_invalidate_folio(struct extent_io_tree *tree, + struct folio *folio, size_t offset); + +int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num); @@ -293,20 +259,25 @@ int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num); * bio end_io callback is called to indicate things have failed. */ struct io_failure_record { + /* Use rb_simple_node for search/insert */ + struct { + struct rb_node rb_node; + u64 bytenr; + }; struct page *page; - u64 start; u64 len; u64 logical; - unsigned long bio_flags; int this_mirror; int failed_mirror; + int num_copies; }; -int btrfs_repair_one_sector(struct inode *inode, - struct bio *failed_bio, u32 bio_offset, - struct page *page, unsigned int pgoff, - u64 start, int failed_mirror, +int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, + u32 bio_offset, struct page *page, unsigned int pgoff, submit_bio_hook_t *submit_bio_hook); +void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end); +int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, + struct page *page, unsigned int pg_offset); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 5a36add21305..6092a4eedc92 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -7,6 +7,7 @@ #include "volumes.h" #include "extent_map.h" #include "compression.h" +#include "btrfs_inode.h" static struct kmem_cache *extent_map_cache; @@ -54,9 +55,7 @@ struct extent_map *alloc_extent_map(void) if (!em) return NULL; RB_CLEAR_NODE(&em->rb_node); - em->flags = 0; em->compress_type = BTRFS_COMPRESS_NONE; - em->generation = 0; refcount_set(&em->refs, 1); INIT_LIST_HEAD(&em->list); return em; @@ -73,7 +72,6 @@ void free_extent_map(struct extent_map *em) { if (!em) return; - WARN_ON(refcount_read(&em->refs) == 0); if (refcount_dec_and_test(&em->refs)) { WARN_ON(extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); @@ -143,8 +141,7 @@ static int tree_insert(struct rb_root_cached *root, struct extent_map *em) * it can't be found, try to find some neighboring extents */ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, - struct rb_node **prev_ret, - struct rb_node **next_ret) + struct rb_node **prev_or_next_ret) { struct rb_node *n = root->rb_node; struct rb_node *prev = NULL; @@ -152,6 +149,8 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, struct extent_map *entry; struct extent_map *prev_entry = NULL; + ASSERT(prev_or_next_ret); + while (n) { entry = rb_entry(n, struct extent_map, rb_node); prev = n; @@ -165,24 +164,29 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, return n; } - if (prev_ret) { - orig_prev = prev; - while (prev && offset >= extent_map_end(prev_entry)) { - prev = rb_next(prev); - prev_entry = rb_entry(prev, struct extent_map, rb_node); - } - *prev_ret = prev; - prev = orig_prev; + orig_prev = prev; + while (prev && offset >= extent_map_end(prev_entry)) { + prev = rb_next(prev); + prev_entry = rb_entry(prev, struct extent_map, rb_node); } - if (next_ret) { + /* + * Previous extent map found, return as in this case the caller does not + * care about the next one. + */ + if (prev) { + *prev_or_next_ret = prev; + return NULL; + } + + prev = orig_prev; + prev_entry = rb_entry(prev, struct extent_map, rb_node); + while (prev && offset < prev_entry->start) { + prev = rb_prev(prev); prev_entry = rb_entry(prev, struct extent_map, rb_node); - while (prev && offset < prev_entry->start) { - prev = rb_prev(prev); - prev_entry = rb_entry(prev, struct extent_map, rb_node); - } - *next_ret = prev; } + *prev_or_next_ret = prev; + return NULL; } @@ -261,6 +265,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; em->mod_start = merge->mod_start; em->generation = max(em->generation, merge->generation); + set_bit(EXTENT_FLAG_MERGED, &em->flags); rb_erase_cached(&merge->rb_node, &tree->map); RB_CLEAR_NODE(&merge->rb_node); @@ -278,6 +283,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) RB_CLEAR_NODE(&merge->rb_node); em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; em->generation = max(em->generation, merge->generation); + set_bit(EXTENT_FLAG_MERGED, &em->flags); free_extent_map(merge); } } @@ -334,6 +340,8 @@ out: void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) { + lockdep_assert_held_write(&tree->lock); + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); if (extent_map_in_tree(em)) try_merge_map(tree, em); @@ -380,7 +388,7 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits) __clear_extent_bit(&device->alloc_state, stripe->physical, stripe->physical + stripe_size - 1, bits, - 0, 0, NULL, GFP_NOWAIT, NULL); + NULL, GFP_NOWAIT, NULL); } } @@ -423,16 +431,13 @@ __lookup_extent_mapping(struct extent_map_tree *tree, { struct extent_map *em; struct rb_node *rb_node; - struct rb_node *prev = NULL; - struct rb_node *next = NULL; + struct rb_node *prev_or_next = NULL; u64 end = range_end(start, len); - rb_node = __tree_search(&tree->map.rb_root, start, &prev, &next); + rb_node = __tree_search(&tree->map.rb_root, start, &prev_or_next); if (!rb_node) { - if (prev) - rb_node = prev; - else if (next) - rb_node = next; + if (prev_or_next) + rb_node = prev_or_next; else return NULL; } @@ -490,6 +495,8 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, */ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) { + lockdep_assert_held_write(&tree->lock); + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); rb_erase_cached(&em->rb_node, &tree->map); if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) @@ -504,6 +511,8 @@ void replace_extent_mapping(struct extent_map_tree *tree, struct extent_map *new, int modified) { + lockdep_assert_held_write(&tree->lock); + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags)); ASSERT(extent_map_in_tree(cur)); if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags)) @@ -652,3 +661,293 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, ASSERT(ret == 0 || ret == -EEXIST); return ret; } + +/* + * Drop all extent maps from a tree in the fastest possible way, rescheduling + * if needed. This avoids searching the tree, from the root down to the first + * extent map, before each deletion. + */ +static void drop_all_extent_maps_fast(struct extent_map_tree *tree) +{ + write_lock(&tree->lock); + while (!RB_EMPTY_ROOT(&tree->map.rb_root)) { + struct extent_map *em; + struct rb_node *node; + + node = rb_first_cached(&tree->map); + em = rb_entry(node, struct extent_map, rb_node); + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + remove_extent_mapping(tree, em); + free_extent_map(em); + cond_resched_rwlock_write(&tree->lock); + } + write_unlock(&tree->lock); +} + +/* + * Drop all extent maps in a given range. + * + * @inode: The target inode. + * @start: Start offset of the range. + * @end: End offset of the range (inclusive value). + * @skip_pinned: Indicate if pinned extent maps should be ignored or not. + * + * This drops all the extent maps that intersect the given range [@start, @end]. + * Extent maps that partially overlap the range and extend behind or beyond it, + * are split. + * The caller should have locked an appropriate file range in the inode's io + * tree before calling this function. + */ +void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, + bool skip_pinned) +{ + struct extent_map *split; + struct extent_map *split2; + struct extent_map *em; + struct extent_map_tree *em_tree = &inode->extent_tree; + u64 len = end - start + 1; + + WARN_ON(end < start); + if (end == (u64)-1) { + if (start == 0 && !skip_pinned) { + drop_all_extent_maps_fast(em_tree); + return; + } + len = (u64)-1; + } else { + /* Make end offset exclusive for use in the loop below. */ + end++; + } + + /* + * It's ok if we fail to allocate the extent maps, see the comment near + * the bottom of the loop below. We only need two spare extent maps in + * the worst case, where the first extent map that intersects our range + * starts before the range and the last extent map that intersects our + * range ends after our range (and they might be the same extent map), + * because we need to split those two extent maps at the boundaries. + */ + split = alloc_extent_map(); + split2 = alloc_extent_map(); + + write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + + while (em) { + /* extent_map_end() returns exclusive value (last byte + 1). */ + const u64 em_end = extent_map_end(em); + struct extent_map *next_em = NULL; + u64 gen; + unsigned long flags; + bool modified; + bool compressed; + + if (em_end < end) { + next_em = next_extent_map(em); + if (next_em) { + if (next_em->start < end) + refcount_inc(&next_em->refs); + else + next_em = NULL; + } + } + + if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { + start = em_end; + if (end != (u64)-1) + len = start + len - em_end; + goto next; + } + + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + clear_bit(EXTENT_FLAG_LOGGING, &flags); + modified = !list_empty(&em->list); + + /* + * The extent map does not cross our target range, so no need to + * split it, we can remove it directly. + */ + if (em->start >= start && em_end <= end) + goto remove_em; + + flags = em->flags; + gen = em->generation; + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + + if (em->start < start) { + if (!split) { + split = split2; + split2 = NULL; + if (!split) + goto remove_em; + } + split->start = em->start; + split->len = start - em->start; + + if (em->block_start < EXTENT_MAP_LAST_BYTE) { + split->orig_start = em->orig_start; + split->block_start = em->block_start; + + if (compressed) + split->block_len = em->block_len; + else + split->block_len = split->len; + split->orig_block_len = max(split->block_len, + em->orig_block_len); + split->ram_bytes = em->ram_bytes; + } else { + split->orig_start = split->start; + split->block_len = 0; + split->block_start = em->block_start; + split->orig_block_len = 0; + split->ram_bytes = split->len; + } + + split->generation = gen; + split->flags = flags; + split->compress_type = em->compress_type; + replace_extent_mapping(em_tree, em, split, modified); + free_extent_map(split); + split = split2; + split2 = NULL; + } + if (em_end > end) { + if (!split) { + split = split2; + split2 = NULL; + if (!split) + goto remove_em; + } + split->start = start + len; + split->len = em_end - (start + len); + split->block_start = em->block_start; + split->flags = flags; + split->compress_type = em->compress_type; + split->generation = gen; + + if (em->block_start < EXTENT_MAP_LAST_BYTE) { + split->orig_block_len = max(em->block_len, + em->orig_block_len); + + split->ram_bytes = em->ram_bytes; + if (compressed) { + split->block_len = em->block_len; + split->orig_start = em->orig_start; + } else { + const u64 diff = start + len - em->start; + + split->block_len = split->len; + split->block_start += diff; + split->orig_start = em->orig_start; + } + } else { + split->ram_bytes = split->len; + split->orig_start = split->start; + split->block_len = 0; + split->orig_block_len = 0; + } + + if (extent_map_in_tree(em)) { + replace_extent_mapping(em_tree, em, split, + modified); + } else { + int ret; + + ret = add_extent_mapping(em_tree, split, + modified); + /* Logic error, shouldn't happen. */ + ASSERT(ret == 0); + if (WARN_ON(ret != 0) && modified) + btrfs_set_inode_full_sync(inode); + } + free_extent_map(split); + split = NULL; + } +remove_em: + if (extent_map_in_tree(em)) { + /* + * If the extent map is still in the tree it means that + * either of the following is true: + * + * 1) It fits entirely in our range (doesn't end beyond + * it or starts before it); + * + * 2) It starts before our range and/or ends after our + * range, and we were not able to allocate the extent + * maps for split operations, @split and @split2. + * + * If we are at case 2) then we just remove the entire + * extent map - this is fine since if anyone needs it to + * access the subranges outside our range, will just + * load it again from the subvolume tree's file extent + * item. However if the extent map was in the list of + * modified extents, then we must mark the inode for a + * full fsync, otherwise a fast fsync will miss this + * extent if it's new and needs to be logged. + */ + if ((em->start < start || em_end > end) && modified) { + ASSERT(!split); + btrfs_set_inode_full_sync(inode); + } + remove_extent_mapping(em_tree, em); + } + + /* + * Once for the tree reference (we replaced or removed the + * extent map from the tree). + */ + free_extent_map(em); +next: + /* Once for us (for our lookup reference). */ + free_extent_map(em); + + em = next_em; + } + + write_unlock(&em_tree->lock); + + free_extent_map(split); + free_extent_map(split2); +} + +/* + * Replace a range in the inode's extent map tree with a new extent map. + * + * @inode: The target inode. + * @new_em: The new extent map to add to the inode's extent map tree. + * @modified: Indicate if the new extent map should be added to the list of + * modified extents (for fast fsync tracking). + * + * Drops all the extent maps in the inode's extent map tree that intersect the + * range of the new extent map and adds the new extent map to the tree. + * The caller should have locked an appropriate file range in the inode's io + * tree before calling this function. + */ +int btrfs_replace_extent_map_range(struct btrfs_inode *inode, + struct extent_map *new_em, + bool modified) +{ + const u64 end = new_em->start + new_em->len - 1; + struct extent_map_tree *tree = &inode->extent_tree; + int ret; + + ASSERT(!extent_map_in_tree(new_em)); + + /* + * The caller has locked an appropriate file range in the inode's io + * tree, but getting -EEXIST when adding the new extent map can still + * happen in case there are extents that partially cover the range, and + * this is due to two tasks operating on different parts of the extent. + * See commit 18e83ac75bfe67 ("Btrfs: fix unexpected EEXIST from + * btrfs_get_extent") for an example and details. + */ + do { + btrfs_drop_extent_map_range(inode, new_em->start, end, false); + write_lock(&tree->lock); + ret = add_extent_mapping(tree, new_em, modified); + write_unlock(&tree->lock); + } while (ret == -EEXIST); + + return ret; +} diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 8e217337dff9..ad311864272a 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -25,6 +25,8 @@ enum { EXTENT_FLAG_FILLING, /* filesystem extent mapping type */ EXTENT_FLAG_FS_MAPPING, + /* This em is merged from two or more physically adjacent ems */ + EXTENT_FLAG_MERGED, }; struct extent_map { @@ -40,6 +42,12 @@ struct extent_map { u64 ram_bytes; u64 block_start; u64 block_len; + + /* + * Generation of the extent map, for merged em it's the highest + * generation of all merged ems. + * For non-merged extents, it's from btrfs_file_extent_item::generation. + */ u64 generation; unsigned long flags; /* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */ @@ -55,6 +63,8 @@ struct extent_map_tree { rwlock_t lock; }; +struct btrfs_inode; + static inline int extent_map_in_tree(const struct extent_map *em) { return !RB_EMPTY_NODE(&em->rb_node); @@ -96,5 +106,11 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree, struct extent_map **em_in, u64 start, u64 len); +void btrfs_drop_extent_map_range(struct btrfs_inode *inode, + u64 start, u64 end, + bool skip_pinned); +int btrfs_replace_extent_map_range(struct btrfs_inode *inode, + struct extent_map *new_em, + bool modified); #endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 90c5c38836ab..6bb9fa961a6a 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -118,7 +118,7 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES)) return 0; return clear_extent_bit(&inode->file_extent_tree, start, - start + len - 1, EXTENT_DIRTY, 0, 0, NULL); + start + len - 1, EXTENT_DIRTY, NULL); } static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info, @@ -129,12 +129,20 @@ static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info, return ncsums * fs_info->sectorsize; } -int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, +/* + * Calculate the total size needed to allocate for an ordered sum structure + * spanning @bytes in the file. + */ +static int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes) +{ + int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize); + + return sizeof(struct btrfs_ordered_sum) + num_sectors * fs_info->csum_size; +} + +int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 objectid, u64 pos, - u64 disk_offset, u64 disk_num_bytes, - u64 num_bytes, u64 offset, u64 ram_bytes, - u8 compression, u8 encryption, u16 other_encoding) + u64 objectid, u64 pos, u64 num_bytes) { int ret = 0; struct btrfs_file_extent_item *item; @@ -157,16 +165,16 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset); - btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes); - btrfs_set_file_extent_offset(leaf, item, offset); + btrfs_set_file_extent_disk_bytenr(leaf, item, 0); + btrfs_set_file_extent_disk_num_bytes(leaf, item, 0); + btrfs_set_file_extent_offset(leaf, item, 0); btrfs_set_file_extent_num_bytes(leaf, item, num_bytes); - btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes); + btrfs_set_file_extent_ram_bytes(leaf, item, num_bytes); btrfs_set_file_extent_generation(leaf, item, trans->transid); btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); - btrfs_set_file_extent_compression(leaf, item, compression); - btrfs_set_file_extent_encryption(leaf, item, encryption); - btrfs_set_file_extent_other_encoding(leaf, item, other_encoding); + btrfs_set_file_extent_compression(leaf, item, 0); + btrfs_set_file_extent_encryption(leaf, item, 0); + btrfs_set_file_extent_other_encoding(leaf, item, 0); btrfs_mark_buffer_dirty(leaf); out: @@ -305,7 +313,7 @@ found: read_extent_buffer(path->nodes[0], dst, (unsigned long)item, ret * csum_size); out: - if (ret == -ENOENT) + if (ret == -ENOENT || ret == -EFBIG) ret = 0; return ret; } @@ -368,6 +376,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_bio *bbio = NULL; struct btrfs_path *path; const u32 sectorsize = fs_info->sectorsize; const u32 csum_size = fs_info->csum_size; @@ -377,6 +386,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst u8 *csum; const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; int count = 0; + blk_status_t ret = BLK_STS_OK; if ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) @@ -400,7 +410,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst return BLK_STS_RESOURCE; if (!dst) { - struct btrfs_bio *bbio = btrfs_bio(bio); + bbio = btrfs_bio(bio); if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); @@ -456,21 +466,27 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst count = search_csum_tree(fs_info, path, cur_disk_bytenr, search_len, csum_dst); - if (count <= 0) { - /* - * Either we hit a critical error or we didn't find - * the csum. - * Either way, we put zero into the csums dst, and skip - * to the next sector. - */ + if (count < 0) { + ret = errno_to_blk_status(count); + if (bbio) + btrfs_bio_free_csum(bbio); + break; + } + + /* + * We didn't find a csum for this range. We need to make sure + * we complain loudly about this, because we are not NODATASUM. + * + * However for the DATA_RELOC inode we could potentially be + * relocating data extents for a NODATASUM inode, so the inode + * itself won't be marked with NODATASUM, but the extent we're + * copying is in fact NODATASUM. If we don't find a csum we + * assume this is the case. + */ + if (count == 0) { memset(csum_dst, 0, csum_size); count = 1; - /* - * For data reloc inode, we need to mark the range - * NODATASUM so that balance won't report false csum - * error. - */ if (BTRFS_I(inode)->root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { u64 file_offset; @@ -491,11 +507,12 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst } btrfs_free_path(path); - return BLK_STS_OK; + return ret; } int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list, int search_commit) + struct list_head *list, int search_commit, + bool nowait) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; @@ -517,6 +534,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, if (!path) return -ENOMEM; + path->nowait = nowait; if (search_commit) { path->skip_locking = 1; path->reada = READA_FORWARD; @@ -612,32 +630,33 @@ fail: return ret; } -/* - * btrfs_csum_one_bio - Calculates checksums of the data contained inside a bio +/** + * Calculate checksums of the data contained inside a bio + * * @inode: Owner of the data inside the bio * @bio: Contains the data to be checksummed - * @file_start: offset in file this bio begins to describe - * @contig: Boolean. If true/1 means all bio vecs in this bio are - * contiguous and they begin at @file_start in the file. False/0 - * means this bio can contain potentially discontiguous bio vecs - * so the logical offset of each should be calculated separately. + * @offset: If (u64)-1, @bio may contain discontiguous bio vecs, so the + * file offsets are determined from the page offsets in the bio. + * Otherwise, this is the starting file offset of the bio vecs in + * @bio, which must be contiguous. + * @one_ordered: If true, @bio only refers to one ordered extent. */ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - u64 file_start, int contig) + u64 offset, bool one_ordered) { struct btrfs_fs_info *fs_info = inode->root->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct btrfs_ordered_sum *sums; struct btrfs_ordered_extent *ordered = NULL; + const bool use_page_offsets = (offset == (u64)-1); char *data; struct bvec_iter iter; struct bio_vec bvec; int index; - int nr_sectors; + unsigned int blockcount; unsigned long total_bytes = 0; unsigned long this_sum_bytes = 0; int i; - u64 offset; unsigned nofs_flag; nofs_flag = memalloc_nofs_save(); @@ -651,18 +670,13 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); - if (contig) - offset = file_start; - else - offset = 0; /* shut up gcc */ - sums->bytenr = bio->bi_iter.bi_sector << 9; index = 0; shash->tfm = fs_info->csum_shash; bio_for_each_segment(bvec, bio, iter) { - if (!contig) + if (use_page_offsets) offset = page_offset(bvec.bv_page) + bvec.bv_offset; if (!ordered) { @@ -681,13 +695,14 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, } } - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, + blockcount = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len + fs_info->sectorsize - 1); - for (i = 0; i < nr_sectors; i++) { - if (offset >= ordered->file_offset + ordered->num_bytes || - offset < ordered->file_offset) { + for (i = 0; i < blockcount; i++) { + if (!one_ordered && + !in_range(offset, ordered->file_offset, + ordered->num_bytes)) { unsigned long bytes_left; sums->len = this_sum_bytes; @@ -1211,6 +1226,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, extent_start = key.offset; extent_end = btrfs_file_extent_end(path); em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + em->generation = btrfs_file_extent_generation(leaf, fi); if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { em->start = extent_start; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 11204dbbe053..d01631d47806 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -50,11 +50,14 @@ struct inode_defrag { /* root objectid */ u64 root; - /* last offset we were able to defrag */ - u64 last_offset; - - /* if we've wrapped around back to zero once already */ - int cycled; + /* + * The extent size threshold for autodefrag. + * + * This value is different for compressed/non-compressed extents, + * thus needs to be passed from higher layer. + * (aka, inode_should_defrag()) + */ + u32 extent_thresh; }; static int __compare_inode_defrag(struct inode_defrag *defrag1, @@ -107,8 +110,8 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, */ if (defrag->transid < entry->transid) entry->transid = defrag->transid; - if (defrag->last_offset > entry->last_offset) - entry->last_offset = defrag->last_offset; + entry->extent_thresh = min(defrag->extent_thresh, + entry->extent_thresh); return -EEXIST; } } @@ -134,7 +137,7 @@ static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) * enabled */ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode) + struct btrfs_inode *inode, u32 extent_thresh) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -160,6 +163,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, defrag->ino = btrfs_ino(inode); defrag->transid = transid; defrag->root = root->root_key.objectid; + defrag->extent_thresh = extent_thresh; spin_lock(&fs_info->defrag_inodes_lock); if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) { @@ -179,34 +183,6 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, } /* - * Requeue the defrag object. If there is a defrag object that points to - * the same inode in the tree, we will merge them together (by - * __btrfs_add_inode_defrag()) and free the one that we want to requeue. - */ -static void btrfs_requeue_inode_defrag(struct btrfs_inode *inode, - struct inode_defrag *defrag) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - int ret; - - if (!__need_auto_defrag(fs_info)) - goto out; - - /* - * Here we don't check the IN_DEFRAG flag, because we need merge - * them together. - */ - spin_lock(&fs_info->defrag_inodes_lock); - ret = __btrfs_add_inode_defrag(inode, defrag); - spin_unlock(&fs_info->defrag_inodes_lock); - if (ret) - goto out; - return; -out: - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); -} - -/* * pick the defragable inode that we want, if it doesn't exist, we will get * the next one. */ @@ -278,8 +254,14 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, struct btrfs_root *inode_root; struct inode *inode; struct btrfs_ioctl_defrag_range_args range; - int num_defrag; - int ret; + int ret = 0; + u64 cur = 0; + +again: + if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) + goto cleanup; + if (!__need_auto_defrag(fs_info)) + goto cleanup; /* get the inode */ inode_root = btrfs_get_fs_root(fs_info, defrag->root, true); @@ -295,39 +277,30 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, goto cleanup; } + if (cur >= i_size_read(inode)) { + iput(inode); + goto cleanup; + } + /* do a chunk of defrag */ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); memset(&range, 0, sizeof(range)); range.len = (u64)-1; - range.start = defrag->last_offset; + range.start = cur; + range.extent_thresh = defrag->extent_thresh; sb_start_write(fs_info->sb); - num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, + ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid, BTRFS_DEFRAG_BATCH); sb_end_write(fs_info->sb); - /* - * if we filled the whole defrag batch, there - * must be more work to do. Queue this defrag - * again - */ - if (num_defrag == BTRFS_DEFRAG_BATCH) { - defrag->last_offset = range.start; - btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag); - } else if (defrag->last_offset && !defrag->cycled) { - /* - * we didn't fill our defrag batch, but - * we didn't start at zero. Make sure we loop - * around to the start of the file. - */ - defrag->last_offset = 0; - defrag->cycled = 1; - btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag); - } else { - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - } - iput(inode); - return 0; + + if (ret < 0) + goto cleanup; + + cur = max(cur + fs_info->sectorsize, range.start); + goto again; + cleanup: kmem_cache_free(btrfs_inode_defrag_cachep, defrag); return ret; @@ -500,7 +473,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, */ clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, cached); + cached); err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, extra_bits, cached); @@ -526,159 +499,6 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, } /* - * this drops all the extents in the cache that intersect the range - * [start, end]. Existing extents are split as required. - */ -void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end, - int skip_pinned) -{ - struct extent_map *em; - struct extent_map *split = NULL; - struct extent_map *split2 = NULL; - struct extent_map_tree *em_tree = &inode->extent_tree; - u64 len = end - start + 1; - u64 gen; - int ret; - int testend = 1; - unsigned long flags; - int compressed = 0; - bool modified; - - WARN_ON(end < start); - if (end == (u64)-1) { - len = (u64)-1; - testend = 0; - } - while (1) { - int no_splits = 0; - - modified = false; - if (!split) - split = alloc_extent_map(); - if (!split2) - split2 = alloc_extent_map(); - if (!split || !split2) - no_splits = 1; - - write_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); - if (!em) { - write_unlock(&em_tree->lock); - break; - } - flags = em->flags; - gen = em->generation; - if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { - if (testend && em->start + em->len >= start + len) { - free_extent_map(em); - write_unlock(&em_tree->lock); - break; - } - start = em->start + em->len; - if (testend) - len = start + len - (em->start + em->len); - free_extent_map(em); - write_unlock(&em_tree->lock); - continue; - } - compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - clear_bit(EXTENT_FLAG_PINNED, &em->flags); - clear_bit(EXTENT_FLAG_LOGGING, &flags); - modified = !list_empty(&em->list); - if (no_splits) - goto next; - - if (em->start < start) { - split->start = em->start; - split->len = start - em->start; - - if (em->block_start < EXTENT_MAP_LAST_BYTE) { - split->orig_start = em->orig_start; - split->block_start = em->block_start; - - if (compressed) - split->block_len = em->block_len; - else - split->block_len = split->len; - split->orig_block_len = max(split->block_len, - em->orig_block_len); - split->ram_bytes = em->ram_bytes; - } else { - split->orig_start = split->start; - split->block_len = 0; - split->block_start = em->block_start; - split->orig_block_len = 0; - split->ram_bytes = split->len; - } - - split->generation = gen; - split->flags = flags; - split->compress_type = em->compress_type; - replace_extent_mapping(em_tree, em, split, modified); - free_extent_map(split); - split = split2; - split2 = NULL; - } - if (testend && em->start + em->len > start + len) { - u64 diff = start + len - em->start; - - split->start = start + len; - split->len = em->start + em->len - (start + len); - split->flags = flags; - split->compress_type = em->compress_type; - split->generation = gen; - - if (em->block_start < EXTENT_MAP_LAST_BYTE) { - split->orig_block_len = max(em->block_len, - em->orig_block_len); - - split->ram_bytes = em->ram_bytes; - if (compressed) { - split->block_len = em->block_len; - split->block_start = em->block_start; - split->orig_start = em->orig_start; - } else { - split->block_len = split->len; - split->block_start = em->block_start - + diff; - split->orig_start = em->orig_start; - } - } else { - split->ram_bytes = split->len; - split->orig_start = split->start; - split->block_len = 0; - split->block_start = em->block_start; - split->orig_block_len = 0; - } - - if (extent_map_in_tree(em)) { - replace_extent_mapping(em_tree, em, split, - modified); - } else { - ret = add_extent_mapping(em_tree, split, - modified); - ASSERT(ret == 0); /* Logic error */ - } - free_extent_map(split); - split = NULL; - } -next: - if (extent_map_in_tree(em)) - remove_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - - /* once for us */ - free_extent_map(em); - /* once for the tree*/ - free_extent_map(em); - } - if (split) - free_extent_map(split); - if (split2) - free_extent_map(split2); -} - -/* * this is very complex, but the basic idea is to drop all extents * in the range start - end. hint_block is filled in with a block number * that would be a good hint to the block allocator for this file. @@ -718,7 +538,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, int modify_tree = -1; int update_refs; int found = 0; - int leafs_visited = 0; struct btrfs_path *path = args->path; args->bytes_found = 0; @@ -736,7 +555,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, } if (args->drop_cache) - btrfs_drop_extent_cache(inode, args->start, args->end - 1, 0); + btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false); if (args->start >= inode->disk_i_size && !args->replace_extent) modify_tree = 0; @@ -756,7 +575,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, path->slots[0]--; } ret = 0; - leafs_visited++; next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { @@ -768,7 +586,6 @@ next_slot: ret = 0; break; } - leafs_visited++; leaf = path->nodes[0]; recow = 1; } @@ -1014,7 +831,7 @@ delete_extent_item: * which case it unlocked our path, so check path->locks[0] matches a * write lock. */ - if (!ret && args->replace_extent && leafs_visited == 1 && + if (!ret && args->replace_extent && path->locks[0] == BTRFS_WRITE_LOCK && btrfs_leaf_free_space(leaf) >= sizeof(struct btrfs_item) + args->extent_item_size) { @@ -1337,11 +1154,12 @@ static int prepare_uptodate_page(struct inode *inode, struct page *page, u64 pos, bool force_uptodate) { + struct folio *folio = page_folio(page); int ret = 0; if (((pos & (PAGE_SIZE - 1)) || force_uptodate) && !PageUptodate(page)) { - ret = btrfs_readpage(NULL, page); + ret = btrfs_read_folio(NULL, folio); if (ret) return ret; lock_page(page); @@ -1351,8 +1169,8 @@ static int prepare_uptodate_page(struct inode *inode, } /* - * Since btrfs_readpage() will unlock the page before it - * returns, there is a window where btrfs_releasepage() can be + * Since btrfs_read_folio() will unlock the folio before it + * returns, there is a window where btrfs_release_folio() can be * called to release the page. Here we check both inode * mapping and PagePrivate() to make sure the page was not * released. @@ -1368,26 +1186,54 @@ static int prepare_uptodate_page(struct inode *inode, return 0; } +static unsigned int get_prepare_fgp_flags(bool nowait) +{ + unsigned int fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT; + + if (nowait) + fgp_flags |= FGP_NOWAIT; + + return fgp_flags; +} + +static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait) +{ + gfp_t gfp; + + gfp = btrfs_alloc_write_mask(inode->i_mapping); + if (nowait) { + gfp &= ~__GFP_DIRECT_RECLAIM; + gfp |= GFP_NOWAIT; + } + + return gfp; +} + /* * this just gets pages into the page cache and locks them down. */ static noinline int prepare_pages(struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, - size_t write_bytes, bool force_uptodate) + size_t write_bytes, bool force_uptodate, + bool nowait) { int i; unsigned long index = pos >> PAGE_SHIFT; - gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); + gfp_t mask = get_prepare_gfp_flags(inode, nowait); + unsigned int fgp_flags = get_prepare_fgp_flags(nowait); int err = 0; int faili; for (i = 0; i < num_pages; i++) { again: - pages[i] = find_or_create_page(inode->i_mapping, index + i, - mask | __GFP_WRITE); + pages[i] = pagecache_get_page(inode->i_mapping, index + i, + fgp_flags, mask | __GFP_WRITE); if (!pages[i]) { faili = i - 1; - err = -ENOMEM; + if (nowait) + err = -EAGAIN; + else + err = -ENOMEM; goto fail; } @@ -1405,7 +1251,7 @@ again: pos + write_bytes, false); if (err) { put_page(pages[i]); - if (err == -EAGAIN) { + if (!nowait && err == -EAGAIN) { err = 0; goto again; } @@ -1440,7 +1286,7 @@ static noinline int lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, - u64 *lockstart, u64 *lockend, + u64 *lockstart, u64 *lockend, bool nowait, struct extent_state **cached_state) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1455,15 +1301,27 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, if (start_pos < inode->vfs_inode.i_size) { struct btrfs_ordered_extent *ordered; - lock_extent_bits(&inode->io_tree, start_pos, last_pos, - cached_state); + if (nowait) { + if (!try_lock_extent(&inode->io_tree, start_pos, last_pos)) { + for (i = 0; i < num_pages; i++) { + unlock_page(pages[i]); + put_page(pages[i]); + pages[i] = NULL; + } + + return -EAGAIN; + } + } else { + lock_extent(&inode->io_tree, start_pos, last_pos, cached_state); + } + ordered = btrfs_lookup_ordered_range(inode, start_pos, last_pos - start_pos + 1); if (ordered && ordered->file_offset + ordered->num_bytes > start_pos && ordered->file_offset <= last_pos) { - unlock_extent_cached(&inode->io_tree, start_pos, - last_pos, cached_state); + unlock_extent(&inode->io_tree, start_pos, last_pos, + cached_state); for (i = 0; i < num_pages; i++) { unlock_page(pages[i]); put_page(pages[i]); @@ -1490,7 +1348,26 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, return ret; } -static int check_can_nocow(struct btrfs_inode *inode, loff_t pos, +/* + * Check if we can do nocow write into the range [@pos, @pos + @write_bytes) + * + * @pos: File offset. + * @write_bytes: The length to write, will be updated to the nocow writeable + * range. + * + * This function will flush ordered extents in the range to ensure proper + * nocow checks. + * + * Return: + * > 0 If we can nocow, and updates @write_bytes. + * 0 If we can't do a nocow write. + * -EAGAIN If we can't do a nocow write because snapshoting of the inode's + * root is in progress. + * < 0 If an error happened. + * + * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0. + */ +int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, size_t *write_bytes, bool nowait) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1502,7 +1379,7 @@ static int check_can_nocow(struct btrfs_inode *inode, loff_t pos, if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) return 0; - if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock)) + if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) return -EAGAIN; lockstart = round_down(pos, fs_info->sectorsize); @@ -1511,70 +1388,25 @@ static int check_can_nocow(struct btrfs_inode *inode, loff_t pos, num_bytes = lockend - lockstart + 1; if (nowait) { - struct btrfs_ordered_extent *ordered; - - if (!try_lock_extent(&inode->io_tree, lockstart, lockend)) + if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend)) { + btrfs_drew_write_unlock(&root->snapshot_lock); return -EAGAIN; - - ordered = btrfs_lookup_ordered_range(inode, lockstart, - num_bytes); - if (ordered) { - btrfs_put_ordered_extent(ordered); - ret = -EAGAIN; - goto out_unlock; } } else { - btrfs_lock_and_flush_ordered_range(inode, lockstart, - lockend, NULL); + btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL); } - ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, - NULL, NULL, NULL, false); - if (ret <= 0) { - ret = 0; - if (!nowait) - btrfs_drew_write_unlock(&root->snapshot_lock); - } else { + NULL, NULL, NULL, nowait, false); + if (ret <= 0) + btrfs_drew_write_unlock(&root->snapshot_lock); + else *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); - } -out_unlock: - unlock_extent(&inode->io_tree, lockstart, lockend); + unlock_extent(&inode->io_tree, lockstart, lockend, NULL); return ret; } -static int check_nocow_nolock(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes) -{ - return check_can_nocow(inode, pos, write_bytes, true); -} - -/* - * Check if we can do nocow write into the range [@pos, @pos + @write_bytes) - * - * @pos: File offset - * @write_bytes: The length to write, will be updated to the nocow writeable - * range - * - * This function will flush ordered extents in the range to ensure proper - * nocow checks. - * - * Return: - * >0 and update @write_bytes if we can do nocow write - * 0 if we can't do nocow write - * -EAGAIN if we can't get the needed lock or there are ordered extents - * for * (nowait == true) case - * <0 if other error happened - * - * NOTE: Callers need to release the lock by btrfs_check_nocow_unlock(). - */ -int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes) -{ - return check_can_nocow(inode, pos, write_bytes, false); -} - void btrfs_check_nocow_unlock(struct btrfs_inode *inode) { btrfs_drew_write_unlock(&inode->root->snapshot_lock); @@ -1609,20 +1441,15 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, loff_t oldsize; loff_t start_pos; - if (iocb->ki_flags & IOCB_NOWAIT) { - size_t nocow_bytes = count; - - /* We will allocate space in case nodatacow is not set, so bail */ - if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes) <= 0) - return -EAGAIN; - /* - * There are holes in the range or parts of the range that must - * be COWed (shared extents, RO block groups, etc), so just bail - * out. - */ - if (nocow_bytes < count) - return -EAGAIN; - } + /* + * Quickly bail out on NOWAIT writes if we don't have the nodatacow or + * prealloc flags, as without those flags we always have to COW. We will + * later check if we can really COW into the target range (using + * can_nocow_extent() at btrfs_get_blocks_direct_write()). + */ + if ((iocb->ki_flags & IOCB_NOWAIT) && + !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) + return -EAGAIN; current->backing_dev_info = inode_to_bdi(inode); ret = file_remove_privs(file); @@ -1672,8 +1499,10 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, bool force_page_uptodate = false; loff_t old_isize = i_size_read(inode); unsigned int ilock_flags = 0; + const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); + unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0); - if (iocb->ki_flags & IOCB_NOWAIT) + if (nowait) ilock_flags |= BTRFS_ILOCK_TRY; ret = btrfs_inode_lock(inode, ilock_flags); @@ -1729,18 +1558,29 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, extent_changeset_release(data_reserved); ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, pos, - write_bytes); + write_bytes, nowait); if (ret < 0) { + int can_nocow; + + if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) { + ret = -EAGAIN; + break; + } + /* * If we don't have to COW at the offset, reserve * metadata only. write_bytes may get smaller than * requested here. */ - if (btrfs_check_nocow_lock(BTRFS_I(inode), pos, - &write_bytes) > 0) - only_release_metadata = true; - else + can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos, + &write_bytes, nowait); + if (can_nocow < 0) + ret = can_nocow; + if (can_nocow > 0) + ret = 0; + if (ret) break; + only_release_metadata = true; } num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE); @@ -1749,7 +1589,8 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, fs_info->sectorsize); WARN_ON(reserve_bytes == 0); ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), - reserve_bytes); + reserve_bytes, + reserve_bytes, nowait); if (ret) { if (!only_release_metadata) btrfs_free_reserved_data_space(BTRFS_I(inode), @@ -1757,19 +1598,27 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, write_bytes); else btrfs_check_nocow_unlock(BTRFS_I(inode)); + + if (nowait && ret == -ENOSPC) + ret = -EAGAIN; break; } release_bytes = reserve_bytes; again: + ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags); + if (ret) { + btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); + break; + } + /* * This is going to setup the pages array with the number of * pages we want, so we don't really need to worry about the * contents of pages from loop to loop */ ret = prepare_pages(inode, pages, num_pages, - pos, write_bytes, - force_page_uptodate); + pos, write_bytes, force_page_uptodate, false); if (ret) { btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); @@ -1779,10 +1628,11 @@ again: extents_locked = lock_and_cleanup_extent_if_need( BTRFS_I(inode), pages, num_pages, pos, write_bytes, &lockstart, - &lockend, &cached_state); + &lockend, nowait, &cached_state); if (extents_locked < 0) { - if (extents_locked == -EAGAIN) + if (!nowait && extents_locked == -EAGAIN) goto again; + btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); ret = extents_locked; @@ -1846,8 +1696,8 @@ again: * possible cached extent state to avoid a memory leak. */ if (extents_locked) - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - lockstart, lockend, &cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state); else free_extent_state(cached_state); @@ -1865,8 +1715,6 @@ again: cond_resched(); - balance_dirty_pages_ratelimited(inode->i_mapping); - pos += copied; num_written += copied; } @@ -1912,7 +1760,6 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) { - const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC); struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -1923,6 +1770,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) loff_t endbyte; ssize_t err; unsigned int ilock_flags = 0; + struct iomap_dio *dio; if (iocb->ki_flags & IOCB_NOWAIT) ilock_flags |= BTRFS_ILOCK_TRY; @@ -1966,15 +1814,6 @@ relock: } /* - * We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw() - * calls generic_write_sync() (through iomap_dio_complete()), because - * that results in calling fsync (btrfs_sync_file()) which will try to - * lock the inode in exclusive/write mode. - */ - if (is_sync_write) - iocb->ki_flags &= ~IOCB_DSYNC; - - /* * The iov_iter can be mapped to the same file range we are writing to. * If that's the case, then we will deadlock in the iomap code, because * it first calls our callback btrfs_dio_iomap_begin(), which will create @@ -1992,12 +1831,22 @@ relock: * So here we disable page faults in the iov_iter and then retry if we * got -EFAULT, faulting in the pages before the retry. */ -again: from->nofault = true; - err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL, written); + dio = btrfs_dio_write(iocb, from, written); from->nofault = false; + /* + * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync + * iocb, and that needs to lock the inode. So unlock it before calling + * iomap_dio_complete() to avoid a deadlock. + */ + btrfs_inode_unlock(inode, ilock_flags); + + if (IS_ERR_OR_NULL(dio)) + err = PTR_ERR_OR_ZERO(dio); + else + err = iomap_dio_complete(dio); + /* No increment (+=) because iomap returns a cumulative value. */ if (err > 0) written = err; @@ -2023,24 +1872,29 @@ again: } else { fault_in_iov_iter_readable(from, left); prev_left = left; - goto again; + goto relock; } } - btrfs_inode_unlock(inode, ilock_flags); - /* - * Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do - * the fsync (call generic_write_sync()). + * If 'err' is -ENOTBLK or we have not written all data, then it means + * we must fallback to buffered IO. */ - if (is_sync_write) - iocb->ki_flags |= IOCB_DSYNC; - - /* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. */ if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from)) goto out; buffered: + /* + * If we are in a NOWAIT context, then return -EAGAIN to signal the caller + * it must retry the operation in a context where blocking is acceptable, + * since we currently don't have NOWAIT semantics support for buffered IO + * and may block there for many reasons (reserving space for example). + */ + if (iocb->ki_flags & IOCB_NOWAIT) { + err = -EAGAIN; + goto out; + } + pos = iocb->ki_pos; written_buffered = btrfs_buffered_write(iocb, from); if (written_buffered < 0) { @@ -2066,13 +1920,44 @@ out: return err < 0 ? err : written; } -static ssize_t btrfs_file_write_iter(struct kiocb *iocb, - struct iov_iter *from) +static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + loff_t count; + ssize_t ret; + + btrfs_inode_lock(inode, 0); + count = encoded->len; + ret = generic_write_checks_count(iocb, &count); + if (ret == 0 && count != encoded->len) { + /* + * The write got truncated by generic_write_checks_count(). We + * can't do a partial encoded write. + */ + ret = -EFBIG; + } + if (ret || encoded->len == 0) + goto out; + + ret = btrfs_write_check(iocb, from, encoded->len); + if (ret < 0) + goto out; + + ret = btrfs_do_encoded_write(iocb, from, encoded); +out: + btrfs_inode_unlock(inode, 0); + return ret; +} + +ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded) { struct file *file = iocb->ki_filp; struct btrfs_inode *inode = BTRFS_I(file_inode(file)); - ssize_t num_written = 0; - const bool sync = iocb->ki_flags & IOCB_DSYNC; + ssize_t num_written, num_sync; + const bool sync = iocb_is_dsync(iocb); /* * If the fs flips readonly due to some impossible error, although we @@ -2082,22 +1967,30 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, if (BTRFS_FS_ERROR(inode->root->fs_info)) return -EROFS; - if (!(iocb->ki_flags & IOCB_DIRECT) && - (iocb->ki_flags & IOCB_NOWAIT)) + if (encoded && (iocb->ki_flags & IOCB_NOWAIT)) return -EOPNOTSUPP; if (sync) atomic_inc(&inode->sync_writers); - if (iocb->ki_flags & IOCB_DIRECT) + if (encoded) { + num_written = btrfs_encoded_write(iocb, from, encoded); + num_sync = encoded->len; + } else if (iocb->ki_flags & IOCB_DIRECT) { num_written = btrfs_direct_write(iocb, from); - else + num_sync = num_written; + } else { num_written = btrfs_buffered_write(iocb, from); + num_sync = num_written; + } btrfs_set_inode_last_sub_trans(inode); - if (num_written > 0) - num_written = generic_write_sync(iocb, num_written); + if (num_sync > 0) { + num_sync = generic_write_sync(iocb, num_sync); + if (num_sync < 0) + num_written = num_sync; + } if (sync) atomic_dec(&inode->sync_writers); @@ -2106,6 +1999,11 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, return num_written; } +static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + return btrfs_do_write_iter(iocb, from, NULL); +} + int btrfs_release_file(struct inode *inode, struct file *filp) { struct btrfs_file_private *private = filp->private_data; @@ -2225,14 +2123,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); /* - * Always check for the full sync flag while holding the inode's lock, - * to avoid races with other tasks. The flag must be either set all the - * time during logging or always off all the time while logging. - */ - full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); - - /* * Before we acquired the inode's lock and the mmap lock, someone may * have dirtied more pages in the target range. We need to make sure * that writeback for any such pages does not start while we are logging @@ -2257,6 +2147,17 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } /* + * Always check for the full sync flag while holding the inode's lock, + * to avoid races with other tasks. The flag must be either set all the + * time during logging or always off all the time while logging. + * We check the flag here after starting delalloc above, because when + * running delalloc the full sync flag may be set if we need to drop + * extra extent map ranges due to temporary memory allocation failures. + */ + full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + + /* * We have to do this here to avoid the priority inversion of waiting on * IO of a lower priority task while holding a transaction open. * @@ -2331,7 +2232,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) btrfs_release_log_ctx_extents(&ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ - ret = 1; + ret = BTRFS_LOG_FORCE_COMMIT; } /* we've logged all the items and now have a consistent @@ -2346,27 +2247,65 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); - if (ret != BTRFS_NO_LOG_SYNC) { + if (ret == BTRFS_NO_LOG_SYNC) { + ret = btrfs_end_transaction(trans); + goto out; + } + + /* We successfully logged the inode, attempt to sync the log. */ + if (!ret) { + ret = btrfs_sync_log(trans, root, &ctx); if (!ret) { - ret = btrfs_sync_log(trans, root, &ctx); - if (!ret) { - ret = btrfs_end_transaction(trans); - goto out; - } - } - if (!full_sync) { - ret = btrfs_wait_ordered_range(inode, start, len); - if (ret) { - btrfs_end_transaction(trans); - goto out; - } + ret = btrfs_end_transaction(trans); + goto out; } - ret = btrfs_commit_transaction(trans); - } else { + } + + /* + * At this point we need to commit the transaction because we had + * btrfs_need_log_full_commit() or some other error. + * + * If we didn't do a full sync we have to stop the trans handle, wait on + * the ordered extents, start it again and commit the transaction. If + * we attempt to wait on the ordered extents here we could deadlock with + * something like fallocate() that is holding the extent lock trying to + * start a transaction while some other thread is trying to commit the + * transaction while we (fsync) are currently holding the transaction + * open. + */ + if (!full_sync) { ret = btrfs_end_transaction(trans); + if (ret) + goto out; + ret = btrfs_wait_ordered_range(inode, start, len); + if (ret) + goto out; + + /* + * This is safe to use here because we're only interested in + * making sure the transaction that had the ordered extents is + * committed. We aren't waiting on anything past this point, + * we're purely getting the transaction and committing it. + */ + trans = btrfs_attach_transaction_barrier(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + + /* + * We committed the transaction and there's no currently + * running transaction, this means everything we care + * about made it to disk and we are done. + */ + if (ret == -ENOENT) + ret = 0; + goto out; + } } + + ret = btrfs_commit_transaction(trans); out: ASSERT(list_empty(&ctx.list)); + ASSERT(list_empty(&ctx.conflict_inodes)); err = file_check_and_advance_wb_err(file); if (!ret) ret = err; @@ -2388,7 +2327,7 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) { struct address_space *mapping = filp->f_mapping; - if (!mapping->a_ops->readpage) + if (!mapping->a_ops->read_folio) return -ENOEXEC; file_accessed(filp); @@ -2435,7 +2374,6 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; struct extent_map *hole_em; - struct extent_map_tree *em_tree = &inode->extent_tree; struct btrfs_key key; int ret; @@ -2469,6 +2407,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_mark_buffer_dirty(leaf); goto out; } @@ -2485,13 +2424,14 @@ static int fill_holes(struct btrfs_trans_handle *trans, btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_mark_buffer_dirty(leaf); goto out; } btrfs_release_path(path); - ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), - offset, 0, 0, end - offset, 0, end - offset, 0, 0, 0); + ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, + end - offset); if (ret) return ret; @@ -2500,8 +2440,8 @@ out: hole_em = alloc_extent_map(); if (!hole_em) { - btrfs_drop_extent_cache(inode, offset, end - 1, 0); - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); + btrfs_drop_extent_map_range(inode, offset, end - 1, false); + btrfs_set_inode_full_sync(inode); } else { hole_em->start = offset; hole_em->len = end - offset; @@ -2514,16 +2454,10 @@ out: hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = trans->transid; - do { - btrfs_drop_extent_cache(inode, offset, end - 1, 0); - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, hole_em, 1); - write_unlock(&em_tree->lock); - } while (ret == -EEXIST); + ret = btrfs_replace_extent_map_range(inode, hole_em, true); free_extent_map(hole_em); if (ret) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &inode->runtime_flags); + btrfs_set_inode_full_sync(inode); } return 0; @@ -2558,10 +2492,10 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len) return ret; } -static int btrfs_punch_hole_lock_range(struct inode *inode, - const u64 lockstart, - const u64 lockend, - struct extent_state **cached_state) +static void btrfs_punch_hole_lock_range(struct inode *inode, + const u64 lockstart, + const u64 lockend, + struct extent_state **cached_state) { /* * For subpage case, if the range is not at page boundary, we could @@ -2575,40 +2509,29 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1; while (1) { - struct btrfs_ordered_extent *ordered; - int ret; - truncate_pagecache_range(inode, lockstart, lockend); - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - cached_state); - ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), - lockend); - + lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + cached_state); /* - * We need to make sure we have no ordered extents in this range - * and nobody raced in and read a page in this range, if we did - * we need to try again. + * We can't have ordered extents in the range, nor dirty/writeback + * pages, because we have locked the inode's VFS lock in exclusive + * mode, we have locked the inode's i_mmap_lock in exclusive mode, + * we have flushed all delalloc in the range and we have waited + * for any ordered extents in the range to complete. + * We can race with anyone reading pages from this range, so after + * locking the range check if we have pages in the range, and if + * we do, unlock the range and retry. */ - if ((!ordered || - (ordered->file_offset + ordered->num_bytes <= lockstart || - ordered->file_offset > lockend)) && - !filemap_range_has_page(inode->i_mapping, - page_lockstart, page_lockend)) { - if (ordered) - btrfs_put_ordered_extent(ordered); + if (!filemap_range_has_page(inode->i_mapping, page_lockstart, + page_lockend)) break; - } - if (ordered) - btrfs_put_ordered_extent(ordered); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, - lockend, cached_state); - ret = btrfs_wait_ordered_range(inode, lockstart, - lockend - lockstart + 1); - if (ret) - return ret; + + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + cached_state); } - return 0; + + btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend); } static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, @@ -2732,7 +2655,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, goto out; } rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1); - rsv->failfast = 1; + rsv->failfast = true; /* * 1 - update the inode @@ -2754,7 +2677,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); - BUG_ON(ret); + if (WARN_ON(ret)) + goto out_trans; trans->block_rsv = rsv; cur_offset = start; @@ -2838,6 +2762,25 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, extent_info->file_offset += replace_len; } + /* + * We are releasing our handle on the transaction, balance the + * dirty pages of the btree inode and flush delayed items, and + * then get a new transaction handle, which may now point to a + * new transaction in case someone else may have committed the + * transaction we used to replace/drop file extent items. So + * bump the inode's iversion and update mtime and ctime except + * if we are called from a dedupe context. This is because a + * power failure/crash may happen after the transaction is + * committed and before we finish replacing/dropping all the + * file extent items we need. + */ + inode_inc_iversion(&inode->vfs_inode); + + if (!extent_info || extent_info->update_times) { + inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode); + inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime; + } + ret = btrfs_update_inode(trans, root, inode); if (ret) break; @@ -2854,7 +2797,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); - BUG_ON(ret); /* shouldn't happen */ + if (WARN_ON(ret)) + break; trans->block_rsv = rsv; cur_offset = drop_args.drop_end; @@ -2877,7 +2821,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, * maps for the replacement extents (or holes). */ if (extent_info && !extent_info->is_new_extent) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); + btrfs_set_inode_full_sync(inode); if (ret) goto out_trans; @@ -2945,8 +2889,9 @@ out: return ret; } -static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) +static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) { + struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_state *cached_state = NULL; @@ -2963,11 +2908,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) bool truncated_block = false; bool updated_inode = false; + btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); + ret = btrfs_wait_ordered_range(inode, offset, len); if (ret) - return ret; + goto out_only_mutex; - btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); ino_size = round_up(inode->i_size, fs_info->sectorsize); ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); if (ret < 0) @@ -2978,9 +2924,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out_only_mutex; } - lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode))); - lockend = round_down(offset + len, - btrfs_inode_sectorsize(BTRFS_I(inode))) - 1; + ret = file_modified(file); + if (ret) + goto out_only_mutex; + + lockstart = round_up(offset, fs_info->sectorsize); + lockend = round_down(offset + len, fs_info->sectorsize) - 1; same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset)) == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)); /* @@ -3055,10 +3004,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out_only_mutex; } - ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend, - &cached_state); - if (ret) - goto out_only_mutex; + btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state); path = btrfs_alloc_path(); if (!path) { @@ -3074,14 +3020,15 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) ASSERT(trans != NULL); inode_inc_iversion(inode); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = current_time(inode); + inode->i_ctime = inode->i_mtime; ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); updated_inode = true; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); out: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); out_only_mutex: if (!updated_inode && truncated_block && !ret) { /* @@ -3184,7 +3131,7 @@ enum { static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, u64 offset) { - const u64 sectorsize = btrfs_inode_sectorsize(inode); + const u64 sectorsize = inode->root->fs_info->sectorsize; struct extent_map *em; int ret; @@ -3214,14 +3161,12 @@ static int btrfs_zero_range(struct inode *inode, struct extent_changeset *data_reserved = NULL; int ret; u64 alloc_hint = 0; - const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode)); + const u64 sectorsize = fs_info->sectorsize; u64 alloc_start = round_down(offset, sectorsize); u64 alloc_end = round_up(offset + len, sectorsize); u64 bytes_to_reserve = 0; bool space_reserved = false; - inode_dio_wait(inode); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, alloc_end - alloc_start); if (IS_ERR(em)) { @@ -3351,23 +3296,21 @@ reserve_space: if (ret < 0) goto out; space_reserved = true; - ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend, - &cached_state); - if (ret) - goto out; + btrfs_punch_hole_lock_range(inode, lockstart, lockend, + &cached_state); ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, alloc_start, bytes_to_reserve); if (ret) { - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, - lockend, &cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state); goto out; } ret = btrfs_prealloc_file_range(inode, mode, alloc_start, alloc_end - alloc_start, i_blocksize(inode), offset + len, &alloc_hint); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, - lockend, &cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); /* btrfs_prealloc_file_range releases reserved space on error */ if (ret) { space_reserved = false; @@ -3400,8 +3343,11 @@ static long btrfs_fallocate(struct file *file, int mode, u64 alloc_hint = 0; u64 locked_end; u64 actual_end = 0; + u64 data_space_needed = 0; + u64 data_space_reserved = 0; + u64 qgroup_reserved = 0; struct extent_map *em; - int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode)); + int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize; int ret; /* Do not allow fallocate in ZONED mode */ @@ -3418,19 +3364,7 @@ static long btrfs_fallocate(struct file *file, int mode, return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) - return btrfs_punch_hole(inode, offset, len); - - /* - * Only trigger disk allocation, don't trigger qgroup reserve - * - * For qgroup space, it will be checked later. - */ - if (!(mode & FALLOC_FL_ZERO_RANGE)) { - ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), - alloc_end - alloc_start); - if (ret < 0) - return ret; - } + return btrfs_punch_hole(file, offset, len); btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); @@ -3440,6 +3374,10 @@ static long btrfs_fallocate(struct file *file, int mode, goto out; } + ret = file_modified(file); + if (ret) + goto out; + /* * TODO: Move these two operations after we have checked * accurate reserved space, or fallocate can still fail but @@ -3464,8 +3402,12 @@ static long btrfs_fallocate(struct file *file, int mode, } /* - * wait for ordered IO before we have any locks. We'll loop again - * below with the locks held. + * We have locked the inode at the VFS level (in exclusive mode) and we + * have locked the i_mmap_lock lock (in exclusive mode). Now before + * locking the file range, flush all dealloc in the range and wait for + * all ordered extents in the range to complete. After this we can lock + * the file range and, due to the previous locking we did, we know there + * can't be more delalloc or ordered extents in the range. */ ret = btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); @@ -3479,38 +3421,10 @@ static long btrfs_fallocate(struct file *file, int mode, } locked_end = alloc_end - 1; - while (1) { - struct btrfs_ordered_extent *ordered; + lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state); - /* the extent lock is ordered inside the running - * transaction - */ - lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, - locked_end, &cached_state); - ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), - locked_end); - - if (ordered && - ordered->file_offset + ordered->num_bytes > alloc_start && - ordered->file_offset < alloc_end) { - btrfs_put_ordered_extent(ordered); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - alloc_start, locked_end, - &cached_state); - /* - * we can't wait on the range with the transaction - * running or with the extent lock held - */ - ret = btrfs_wait_ordered_range(inode, alloc_start, - alloc_end - alloc_start); - if (ret) - goto out; - } else { - if (ordered) - btrfs_put_ordered_extent(ordered); - break; - } - } + btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end); /* First, check if we exceed the qgroup limit */ INIT_LIST_HEAD(&reserve_list); @@ -3527,48 +3441,64 @@ static long btrfs_fallocate(struct file *file, int mode, if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - ret = add_falloc_range(&reserve_list, cur_offset, - last_byte - cur_offset); + const u64 range_len = last_byte - cur_offset; + + ret = add_falloc_range(&reserve_list, cur_offset, range_len); if (ret < 0) { free_extent_map(em); break; } ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), - &data_reserved, cur_offset, - last_byte - cur_offset); + &data_reserved, cur_offset, range_len); if (ret < 0) { - cur_offset = last_byte; free_extent_map(em); break; } - } else { - /* - * Do not need to reserve unwritten extent for this - * range, free reserved data space first, otherwise - * it'll result in false ENOSPC error. - */ - btrfs_free_reserved_data_space(BTRFS_I(inode), - data_reserved, cur_offset, - last_byte - cur_offset); + qgroup_reserved += range_len; + data_space_needed += range_len; } free_extent_map(em); cur_offset = last_byte; } + if (!ret && data_space_needed > 0) { + /* + * We are safe to reserve space here as we can't have delalloc + * in the range, see above. + */ + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), + data_space_needed); + if (!ret) + data_space_reserved = data_space_needed; + } + /* * If ret is still 0, means we're OK to fallocate. * Or just cleanup the list and exit. */ list_for_each_entry_safe(range, tmp, &reserve_list, list) { - if (!ret) + if (!ret) { ret = btrfs_prealloc_file_range(inode, mode, range->start, range->len, i_blocksize(inode), offset + len, &alloc_hint); - else + /* + * btrfs_prealloc_file_range() releases space even + * if it returns an error. + */ + data_space_reserved -= range->len; + qgroup_reserved -= range->len; + } else if (data_space_reserved > 0) { btrfs_free_reserved_data_space(BTRFS_I(inode), - data_reserved, range->start, - range->len); + data_reserved, range->start, + range->len); + data_space_reserved -= range->len; + qgroup_reserved -= range->len; + } else if (qgroup_reserved > 0) { + btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved, + range->start, range->len); + qgroup_reserved -= range->len; + } list_del(&range->list); kfree(range); } @@ -3581,35 +3511,290 @@ static long btrfs_fallocate(struct file *file, int mode, */ ret = btrfs_fallocate_update_isize(inode, actual_end, mode); out_unlock: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state); out: btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); - /* Let go of our reservation. */ - if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE)) - btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved, - cur_offset, alloc_end - cur_offset); extent_changeset_free(data_reserved); return ret; } +/* + * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range + * that has unflushed and/or flushing delalloc. There might be other adjacent + * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps + * looping while it gets adjacent subranges, and merging them together. + */ +static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end, + u64 *delalloc_start_ret, u64 *delalloc_end_ret) +{ + const u64 len = end + 1 - start; + struct extent_map_tree *em_tree = &inode->extent_tree; + struct extent_map *em; + u64 em_end; + u64 delalloc_len; + + /* + * Search the io tree first for EXTENT_DELALLOC. If we find any, it + * means we have delalloc (dirty pages) for which writeback has not + * started yet. + */ + *delalloc_start_ret = start; + delalloc_len = count_range_bits(&inode->io_tree, delalloc_start_ret, end, + len, EXTENT_DELALLOC, 1); + /* + * If delalloc was found then *delalloc_start_ret has a sector size + * aligned value (rounded down). + */ + if (delalloc_len > 0) + *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1; + + /* + * Now also check if there's any extent map in the range that does not + * map to a hole or prealloc extent. We do this because: + * + * 1) When delalloc is flushed, the file range is locked, we clear the + * EXTENT_DELALLOC bit from the io tree and create an extent map for + * an allocated extent. So we might just have been called after + * delalloc is flushed and before the ordered extent completes and + * inserts the new file extent item in the subvolume's btree; + * + * 2) We may have an extent map created by flushing delalloc for a + * subrange that starts before the subrange we found marked with + * EXTENT_DELALLOC in the io tree. + */ + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + read_unlock(&em_tree->lock); + + /* extent_map_end() returns a non-inclusive end offset. */ + em_end = em ? extent_map_end(em) : 0; + + /* + * If we have a hole/prealloc extent map, check the next one if this one + * ends before our range's end. + */ + if (em && (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) && em_end < end) { + struct extent_map *next_em; + + read_lock(&em_tree->lock); + next_em = lookup_extent_mapping(em_tree, em_end, len - em_end); + read_unlock(&em_tree->lock); + + free_extent_map(em); + em_end = next_em ? extent_map_end(next_em) : 0; + em = next_em; + } + + if (em && (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + free_extent_map(em); + em = NULL; + } + + /* + * No extent map or one for a hole or prealloc extent. Use the delalloc + * range we found in the io tree if we have one. + */ + if (!em) + return (delalloc_len > 0); + + /* + * We don't have any range as EXTENT_DELALLOC in the io tree, so the + * extent map is the only subrange representing delalloc. + */ + if (delalloc_len == 0) { + *delalloc_start_ret = em->start; + *delalloc_end_ret = min(end, em_end - 1); + free_extent_map(em); + return true; + } + + /* + * The extent map represents a delalloc range that starts before the + * delalloc range we found in the io tree. + */ + if (em->start < *delalloc_start_ret) { + *delalloc_start_ret = em->start; + /* + * If the ranges are adjacent, return a combined range. + * Otherwise return the extent map's range. + */ + if (em_end < *delalloc_start_ret) + *delalloc_end_ret = min(end, em_end - 1); + + free_extent_map(em); + return true; + } + + /* + * The extent map starts after the delalloc range we found in the io + * tree. If it's adjacent, return a combined range, otherwise return + * the range found in the io tree. + */ + if (*delalloc_end_ret + 1 == em->start) + *delalloc_end_ret = min(end, em_end - 1); + + free_extent_map(em); + return true; +} + +/* + * Check if there's delalloc in a given range. + * + * @inode: The inode. + * @start: The start offset of the range. It does not need to be + * sector size aligned. + * @end: The end offset (inclusive value) of the search range. + * It does not need to be sector size aligned. + * @delalloc_start_ret: Output argument, set to the start offset of the + * subrange found with delalloc (may not be sector size + * aligned). + * @delalloc_end_ret: Output argument, set to he end offset (inclusive value) + * of the subrange found with delalloc. + * + * Returns true if a subrange with delalloc is found within the given range, and + * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and + * end offsets of the subrange. + */ +bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, + u64 *delalloc_start_ret, u64 *delalloc_end_ret) +{ + u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize); + u64 prev_delalloc_end = 0; + bool ret = false; + + while (cur_offset < end) { + u64 delalloc_start; + u64 delalloc_end; + bool delalloc; + + delalloc = find_delalloc_subrange(inode, cur_offset, end, + &delalloc_start, + &delalloc_end); + if (!delalloc) + break; + + if (prev_delalloc_end == 0) { + /* First subrange found. */ + *delalloc_start_ret = max(delalloc_start, start); + *delalloc_end_ret = delalloc_end; + ret = true; + } else if (delalloc_start == prev_delalloc_end + 1) { + /* Subrange adjacent to the previous one, merge them. */ + *delalloc_end_ret = delalloc_end; + } else { + /* Subrange not adjacent to the previous one, exit. */ + break; + } + + prev_delalloc_end = delalloc_end; + cur_offset = delalloc_end + 1; + cond_resched(); + } + + return ret; +} + +/* + * Check if there's a hole or delalloc range in a range representing a hole (or + * prealloc extent) found in the inode's subvolume btree. + * + * @inode: The inode. + * @whence: Seek mode (SEEK_DATA or SEEK_HOLE). + * @start: Start offset of the hole region. It does not need to be sector + * size aligned. + * @end: End offset (inclusive value) of the hole region. It does not + * need to be sector size aligned. + * @start_ret: Return parameter, used to set the start of the subrange in the + * hole that matches the search criteria (seek mode), if such + * subrange is found (return value of the function is true). + * The value returned here may not be sector size aligned. + * + * Returns true if a subrange matching the given seek mode is found, and if one + * is found, it updates @start_ret with the start of the subrange. + */ +static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence, + u64 start, u64 end, u64 *start_ret) +{ + u64 delalloc_start; + u64 delalloc_end; + bool delalloc; + + delalloc = btrfs_find_delalloc_in_range(inode, start, end, + &delalloc_start, &delalloc_end); + if (delalloc && whence == SEEK_DATA) { + *start_ret = delalloc_start; + return true; + } + + if (delalloc && whence == SEEK_HOLE) { + /* + * We found delalloc but it starts after out start offset. So we + * have a hole between our start offset and the delalloc start. + */ + if (start < delalloc_start) { + *start_ret = start; + return true; + } + /* + * Delalloc range starts at our start offset. + * If the delalloc range's length is smaller than our range, + * then it means we have a hole that starts where the delalloc + * subrange ends. + */ + if (delalloc_end < end) { + *start_ret = delalloc_end + 1; + return true; + } + + /* There's delalloc for the whole range. */ + return false; + } + + if (!delalloc && whence == SEEK_HOLE) { + *start_ret = start; + return true; + } + + /* + * No delalloc in the range and we are seeking for data. The caller has + * to iterate to the next extent item in the subvolume btree. + */ + return false; +} + static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, int whence) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_map *em = NULL; struct extent_state *cached_state = NULL; - loff_t i_size = inode->vfs_inode.i_size; + const loff_t i_size = i_size_read(&inode->vfs_inode); + const u64 ino = btrfs_ino(inode); + struct btrfs_root *root = inode->root; + struct btrfs_path *path; + struct btrfs_key key; + u64 last_extent_end; u64 lockstart; u64 lockend; u64 start; - u64 len; - int ret = 0; + int ret; + bool found = false; if (i_size == 0 || offset >= i_size) return -ENXIO; /* + * Quick path. If the inode has no prealloc extents and its number of + * bytes used matches its i_size, then it can not have holes. + */ + if (whence == SEEK_HOLE && + !(inode->flags & BTRFS_INODE_PREALLOC) && + inode_get_bytes(&inode->vfs_inode) == i_size) + return i_size; + + /* * offset can be negative, in this case we start finding DATA/HOLE from * the very start of the file. */ @@ -3620,45 +3805,164 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, if (lockend <= lockstart) lockend = lockstart + fs_info->sectorsize; lockend--; - len = lockend - lockstart + 1; - lock_extent_bits(&inode->io_tree, lockstart, lockend, &cached_state); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = READA_FORWARD; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + + last_extent_end = lockstart; + + lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } while (start < i_size) { - em = btrfs_get_extent_fiemap(inode, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - em = NULL; - break; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_file_extent_item *extent; + u64 extent_end; + + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + + leaf = path->nodes[0]; } - if (whence == SEEK_HOLE && - (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) - break; - else if (whence == SEEK_DATA && - (em->block_start != EXTENT_MAP_HOLE && - !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) break; - start = em->start + em->len; - free_extent_map(em); - em = NULL; + extent_end = btrfs_file_extent_end(path); + + /* + * In the first iteration we may have a slot that points to an + * extent that ends before our start offset, so skip it. + */ + if (extent_end <= start) { + path->slots[0]++; + continue; + } + + /* We have an implicit hole, NO_HOLES feature is likely set. */ + if (last_extent_end < key.offset) { + u64 search_start = last_extent_end; + u64 found_start; + + /* + * First iteration, @start matches @offset and it's + * within the hole. + */ + if (start == offset) + search_start = offset; + + found = find_desired_extent_in_hole(inode, whence, + search_start, + key.offset - 1, + &found_start); + if (found) { + start = found_start; + break; + } + /* + * Didn't find data or a hole (due to delalloc) in the + * implicit hole range, so need to analyze the extent. + */ + } + + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + if (btrfs_file_extent_disk_bytenr(leaf, extent) == 0 || + btrfs_file_extent_type(leaf, extent) == + BTRFS_FILE_EXTENT_PREALLOC) { + /* + * Explicit hole or prealloc extent, search for delalloc. + * A prealloc extent is treated like a hole. + */ + u64 search_start = key.offset; + u64 found_start; + + /* + * First iteration, @start matches @offset and it's + * within the hole. + */ + if (start == offset) + search_start = offset; + + found = find_desired_extent_in_hole(inode, whence, + search_start, + extent_end - 1, + &found_start); + if (found) { + start = found_start; + break; + } + /* + * Didn't find data or a hole (due to delalloc) in the + * implicit hole range, so need to analyze the next + * extent item. + */ + } else { + /* + * Found a regular or inline extent. + * If we are seeking for data, adjust the start offset + * and stop, we're done. + */ + if (whence == SEEK_DATA) { + start = max_t(u64, key.offset, offset); + found = true; + break; + } + /* + * Else, we are seeking for a hole, check the next file + * extent item. + */ + } + + start = extent_end; + last_extent_end = extent_end; + path->slots[0]++; + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } cond_resched(); } - free_extent_map(em); - unlock_extent_cached(&inode->io_tree, lockstart, lockend, - &cached_state); - if (ret) { - offset = ret; - } else { - if (whence == SEEK_DATA && start >= i_size) - offset = -ENXIO; - else - offset = min_t(loff_t, start, i_size); + + /* We have an implicit hole from the last extent found up to i_size. */ + if (!found && start < i_size) { + found = find_desired_extent_in_hole(inode, whence, start, + i_size - 1, &start); + if (!found) + start = i_size; } - return offset; +out: + unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + btrfs_free_path(path); + + if (ret < 0) + return ret; + + if (whence == SEEK_DATA && start >= i_size) + return -ENXIO; + + return min_t(loff_t, start, i_size); } static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) @@ -3686,7 +3990,7 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) { int ret; - filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC; ret = fsverity_file_open(inode, filp); if (ret) @@ -3746,8 +4050,7 @@ again: */ pagefault_disable(); to->nofault = true; - ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL, read); + ret = btrfs_dio_read(iocb, to, read); to->nofault = false; pagefault_enable(); @@ -3804,6 +4107,7 @@ const struct file_operations btrfs_file_operations = { .mmap = btrfs_file_mmap, .open = btrfs_file_open, .release = btrfs_release_file, + .get_unmapped_area = thp_get_unmapped_area, .fsync = btrfs_sync_file, .fallocate = btrfs_fallocate, .unlocked_ioctl = btrfs_ioctl, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 01a408db5683..f4023651dd68 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -48,6 +48,24 @@ static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, u64 bytes, bool update_stats); +static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_free_space *info; + struct rb_node *node; + + while ((node = rb_last(&ctl->free_space_offset)) != NULL) { + info = rb_entry(node, struct btrfs_free_space, offset_index); + if (!info->bitmap) { + unlink_free_space(ctl, info, true); + kmem_cache_free(btrfs_free_space_cachep, info); + } else { + free_bitmap(ctl, info); + } + + cond_resched_lock(&ctl->tree_lock); + } +} + static struct inode *__lookup_free_space_inode(struct btrfs_root *root, struct btrfs_path *path, u64 offset) @@ -126,10 +144,8 @@ struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, block_group->disk_cache_state = BTRFS_DC_CLEAR; } - if (!block_group->iref) { + if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) block_group->inode = igrab(inode); - block_group->iref = 1; - } spin_unlock(&block_group->lock); return inode; @@ -241,8 +257,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, clear_nlink(inode); /* One for the block groups ref */ spin_lock(&block_group->lock); - if (block_group->iref) { - block_group->iref = 0; + if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) { block_group->inode = NULL; spin_unlock(&block_group->lock); iput(inode); @@ -333,8 +348,8 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, btrfs_i_size_write(inode, 0); truncate_pagecache(vfs_inode, 0); - lock_extent_bits(&inode->io_tree, 0, (u64)-1, &cached_state); - btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); + btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); /* * We skip the throttling logic for free space cache inodes, so we don't @@ -345,7 +360,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); btrfs_inode_safe_disk_i_size_write(inode, control.last_size); - unlock_extent_cached(&inode->io_tree, 0, (u64)-1, &cached_state); + unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); if (ret) goto fail; @@ -465,7 +480,7 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) io_ctl->pages[i] = page; if (uptodate && !PageUptodate(page)) { - btrfs_readpage(NULL, page); + btrfs_read_folio(NULL, page_folio(page)); lock_page(page); if (page->mapping != inode->i_mapping) { btrfs_err(BTRFS_I(inode)->root->fs_info, @@ -693,6 +708,12 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) max_bitmaps = max_t(u64, max_bitmaps, 1); + if (ctl->total_bitmaps > max_bitmaps) + btrfs_err(block_group->fs_info, +"invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu", + block_group->start, block_group->length, + ctl->total_bitmaps, ctl->unit, max_bitmaps, + bytes_per_bg); ASSERT(ctl->total_bitmaps <= max_bitmaps); /* @@ -875,7 +896,10 @@ out: return ret; free_cache: io_ctl_drop_pages(&io_ctl); + + spin_lock(&ctl->tree_lock); __btrfs_remove_free_space_cache(ctl); + spin_unlock(&ctl->tree_lock); goto out; } @@ -914,6 +938,8 @@ static int copy_free_space_cache(struct btrfs_block_group *block_group, return ret; } +static struct lock_class_key btrfs_free_space_inode_key; + int load_free_space_cache(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; @@ -983,6 +1009,14 @@ int load_free_space_cache(struct btrfs_block_group *block_group) } spin_unlock(&block_group->lock); + /* + * Reinitialize the class of struct inode's mapping->invalidate_lock for + * free space inodes to prevent false positives related to locks for normal + * inodes. + */ + lockdep_set_class(&(&inode->i_data)->invalidate_lock, + &btrfs_free_space_inode_key); + ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl, path, block_group->start); btrfs_free_path(path); @@ -1001,7 +1035,13 @@ int load_free_space_cache(struct btrfs_block_group *block_group) if (ret == 0) ret = 1; } else { + /* + * We need to call the _locked variant so we don't try to update + * the discard counters. + */ + spin_lock(&tmp_ctl.tree_lock); __btrfs_remove_free_space_cache(&tmp_ctl); + spin_unlock(&tmp_ctl.tree_lock); btrfs_warn(fs_info, "block group %llu has wrong amount of free space", block_group->start); @@ -1123,7 +1163,7 @@ update_cache_item(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DELALLOC, 0, 0, NULL); + EXTENT_DELALLOC, NULL); goto fail; } leaf = path->nodes[0]; @@ -1135,8 +1175,8 @@ update_cache_item(struct btrfs_trans_handle *trans, if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || found_key.offset != offset) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, - inode->i_size - 1, EXTENT_DELALLOC, 0, - 0, NULL); + inode->i_size - 1, EXTENT_DELALLOC, + NULL); btrfs_release_path(path); goto fail; } @@ -1232,7 +1272,7 @@ static int flush_dirty_cache(struct inode *inode) ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DELALLOC, 0, 0, NULL); + EXTENT_DELALLOC, NULL); return ret; } @@ -1252,8 +1292,8 @@ cleanup_write_cache_enospc(struct inode *inode, struct extent_state **cached_state) { io_ctl_drop_pages(io_ctl); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + cached_state); } static int __btrfs_wait_cache_io(struct btrfs_root *root, @@ -1378,8 +1418,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, if (ret) goto out_unlock; - lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - &cached_state); + lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + &cached_state); io_ctl_set_generation(io_ctl, trans->transid); @@ -1434,8 +1474,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, io_ctl_drop_pages(io_ctl); io_ctl_free(io_ctl); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, &cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + &cached_state); /* * at this point the pages are under IO and we're happy, @@ -2630,16 +2670,19 @@ out: static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, u64 bytenr, u64 size, bool used) { - struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_space_info *sinfo = block_group->space_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; u64 offset = bytenr - block_group->start; u64 to_free, to_unusable; - const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold); + int bg_reclaim_threshold = 0; bool initial = (size == block_group->length); u64 reclaimable_unusable; WARN_ON(!initial && offset + size > block_group->zone_capacity); + if (!initial) + bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); + spin_lock(&ctl->tree_lock); if (!used) to_free = size; @@ -2857,7 +2900,8 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group, if (btrfs_is_zoned(fs_info)) { btrfs_info(fs_info, "free space %llu active %d", block_group->zone_capacity - block_group->alloc_offset, - block_group->zone_is_active); + test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, + &block_group->runtime_flags)); return; } @@ -2961,34 +3005,6 @@ static void __btrfs_return_cluster_to_free_space( btrfs_put_block_group(block_group); } -static void __btrfs_remove_free_space_cache_locked( - struct btrfs_free_space_ctl *ctl) -{ - struct btrfs_free_space *info; - struct rb_node *node; - - while ((node = rb_last(&ctl->free_space_offset)) != NULL) { - info = rb_entry(node, struct btrfs_free_space, offset_index); - if (!info->bitmap) { - unlink_free_space(ctl, info, true); - kmem_cache_free(btrfs_free_space_cachep, info); - } else { - free_bitmap(ctl, info); - } - - cond_resched_lock(&ctl->tree_lock); - } -} - -void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) -{ - spin_lock(&ctl->tree_lock); - __btrfs_remove_free_space_cache_locked(ctl); - if (ctl->block_group) - btrfs_discard_update_discardable(ctl->block_group); - spin_unlock(&ctl->tree_lock); -} - void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; @@ -3006,7 +3022,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) cond_resched_lock(&ctl->tree_lock); } - __btrfs_remove_free_space_cache_locked(ctl); + __btrfs_remove_free_space_cache(ctl); btrfs_discard_update_discardable(block_group); spin_unlock(&ctl->tree_lock); @@ -3533,7 +3549,8 @@ int btrfs_find_space_cluster(struct btrfs_block_group *block_group, * data, keep it dense. */ if (btrfs_test_opt(fs_info, SSD_SPREAD)) { - cont1_bytes = min_bytes = bytes + empty_size; + cont1_bytes = bytes + empty_size; + min_bytes = cont1_bytes; } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { cont1_bytes = bytes; min_bytes = fs_info->sectorsize; @@ -3988,7 +4005,7 @@ int btrfs_trim_block_group(struct btrfs_block_group *block_group, *trimmed = 0; spin_lock(&block_group->lock); - if (block_group->removed) { + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); return 0; } @@ -4018,7 +4035,7 @@ int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group, *trimmed = 0; spin_lock(&block_group->lock); - if (block_group->removed) { + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); return 0; } @@ -4040,7 +4057,7 @@ int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, *trimmed = 0; spin_lock(&block_group->lock); - if (block_group->removed) { + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); return 0; } @@ -4069,7 +4086,7 @@ static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info, btrfs_info(fs_info, "cleaning free space cache v1"); - node = rb_first(&fs_info->block_group_cache_tree); + node = rb_first_cached(&fs_info->block_group_cache_tree); while (node) { block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = btrfs_remove_free_space_inode(trans, NULL, block_group); diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 15591b299895..6d419ba53e95 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -113,7 +113,6 @@ int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, u64 bytenr, u64 size); int btrfs_remove_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size); -void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl); void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group); bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group); u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 655aad0f9e1c..367bcfcf68f5 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -25,6 +25,8 @@ static struct btrfs_root *btrfs_free_space_root( .offset = 0, }; + if (btrfs_fs_incompat(block_group->fs_info, EXTENT_TREE_V2)) + key.offset = block_group->global_root_id; return btrfs_global_root(block_group->fs_info, &key); } @@ -1176,7 +1178,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) goto abort; } - node = rb_first(&fs_info->block_group_cache_tree); + node = rb_first_cached(&fs_info->block_group_cache_tree); while (node) { block_group = rb_entry(node, struct btrfs_block_group, cache_node); @@ -1451,8 +1453,6 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY); ASSERT(key.objectid < end && key.objectid + key.offset <= end); - caching_ctl->progress = key.objectid; - offset = key.objectid; while (offset < key.objectid + key.offset) { bit = free_space_test_bit(block_group, path, offset); @@ -1488,8 +1488,6 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, goto out; } - caching_ctl->progress = (u64)-1; - ret = 0; out: return ret; @@ -1529,8 +1527,6 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY); ASSERT(key.objectid < end && key.objectid + key.offset <= end); - caching_ctl->progress = key.objectid; - total_found += add_new_free_space(block_group, key.objectid, key.objectid + key.offset); if (total_found > CACHING_CTL_WAKE_UP) { @@ -1550,8 +1546,6 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, goto out; } - caching_ctl->progress = (u64)-1; - ret = 0; out: return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3b2403b6127f..0e516aefbf51 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -64,6 +64,39 @@ struct btrfs_iget_args { struct btrfs_dio_data { ssize_t submitted; struct extent_changeset *data_reserved; + bool data_space_reserved; + bool nocow_done; +}; + +struct btrfs_dio_private { + struct inode *inode; + + /* + * Since DIO can use anonymous page, we cannot use page_offset() to + * grab the file offset, thus need a dedicated member for file offset. + */ + u64 file_offset; + /* Used for bio::bi_size */ + u32 bytes; + + /* + * References to this structure. There is one reference per in-flight + * bio plus one while we're still setting up. + */ + refcount_t refs; + + /* Array of checksums */ + u8 *csums; + + /* This must be last */ + struct bio bio; +}; + +static struct bio_set btrfs_dio_bioset; + +struct btrfs_rename_ctx { + /* Output field. Stores the index number of the old directory entry. */ + u64 index; }; static const struct inode_operations btrfs_dir_inode_operations; @@ -81,21 +114,17 @@ struct kmem_cache *btrfs_free_space_bitmap_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct inode *inode, bool skip_writeback); -static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock); + unsigned long *nr_written, int unlock, + u64 *done_offset); static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, u64 orig_block_len, u64 ram_bytes, int compress_type, int type); -static void __endio_write_update_ordered(struct btrfs_inode *inode, - const u64 offset, const u64 bytes, - const bool uptodate); - /* * btrfs_inode_lock - lock inode i_rwsem based on arguments passed * @@ -162,11 +191,14 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; - u64 page_start = page_offset(locked_page); - u64 page_end = page_start + PAGE_SIZE - 1; - + u64 page_start, page_end; struct page *page; + if (locked_page) { + page_start = page_offset(locked_page); + page_end = page_start + PAGE_SIZE - 1; + } + while (index <= end_index) { /* * For locked page, we will call end_extent_writepage() on it @@ -179,7 +211,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, * btrfs_mark_ordered_io_finished() would skip the accounting * for the page range, and the ordered extent will never finish. */ - if (index == (page_offset(locked_page) >> PAGE_SHIFT)) { + if (locked_page && index == (page_start >> PAGE_SHIFT)) { index++; continue; } @@ -190,7 +222,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, /* * Here we just clear all Ordered bits for every page in the - * range, then __endio_write_update_ordered() will handle + * range, then btrfs_mark_ordered_io_finished() will handle * the ordered extent accounting for the range. */ btrfs_page_clamp_clear_ordered(inode->root->fs_info, page, @@ -198,34 +230,47 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, put_page(page); } - /* The locked page covers the full range, nothing needs to be done */ - if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE) - return; - /* - * In case this page belongs to the delalloc range being instantiated - * then skip it, since the first page of a range is going to be - * properly cleaned up by the caller of run_delalloc_range - */ - if (page_start >= offset && page_end <= (offset + bytes - 1)) { - bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; - offset = page_offset(locked_page) + PAGE_SIZE; + if (locked_page) { + /* The locked page covers the full range, nothing needs to be done */ + if (bytes + offset <= page_start + PAGE_SIZE) + return; + /* + * In case this page belongs to the delalloc range being + * instantiated then skip it, since the first page of a range is + * going to be properly cleaned up by the caller of + * run_delalloc_range + */ + if (page_start >= offset && page_end <= (offset + bytes - 1)) { + bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; + offset = page_offset(locked_page) + PAGE_SIZE; + } } - return __endio_write_update_ordered(inode, offset, bytes, false); + return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); } static int btrfs_dirty_inode(struct inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir, - const struct qstr *qstr) + struct btrfs_new_inode_args *args) { int err; - err = btrfs_init_acl(trans, inode, dir); - if (!err) - err = btrfs_xattr_security_init(trans, inode, dir, qstr); - return err; + if (args->default_acl) { + err = __btrfs_set_acl(trans, args->inode, args->default_acl, + ACL_TYPE_DEFAULT); + if (err) + return err; + } + if (args->acl) { + err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS); + if (err) + return err; + } + if (!args->default_acl && !args->acl) + cache_no_acl(args->inode); + return btrfs_xattr_security_init(trans, args->inode, args->dir, + &args->dentry->d_name); } /* @@ -234,12 +279,14 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, * no overlapping inline items exist in the btree */ static int insert_inline_extent(struct btrfs_trans_handle *trans, - struct btrfs_path *path, bool extent_inserted, - struct btrfs_root *root, struct inode *inode, - u64 start, size_t size, size_t compressed_size, + struct btrfs_path *path, + struct btrfs_inode *inode, bool extent_inserted, + size_t size, size_t compressed_size, int compress_type, - struct page **compressed_pages) + struct page **compressed_pages, + bool update_i_size) { + struct btrfs_root *root = inode->root; struct extent_buffer *leaf; struct page *page = NULL; char *kaddr; @@ -247,7 +294,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item *ei; int ret; size_t cur_size = size; - unsigned long offset; + u64 i_size; ASSERT((compressed_size > 0 && compressed_pages) || (compressed_size == 0 && !compressed_pages)); @@ -259,8 +306,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_key key; size_t datasize; - key.objectid = btrfs_ino(BTRFS_I(inode)); - key.offset = start; + key.objectid = btrfs_ino(inode); + key.offset = 0; key.type = BTRFS_EXTENT_DATA_KEY; datasize = btrfs_file_extent_calc_inline_size(cur_size); @@ -287,9 +334,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, cur_size = min_t(unsigned long, compressed_size, PAGE_SIZE); - kaddr = kmap_atomic(cpage); + kaddr = kmap_local_page(cpage); write_extent_buffer(leaf, kaddr, ptr, cur_size); - kunmap_atomic(kaddr); + kunmap_local(kaddr); i++; ptr += cur_size; @@ -298,13 +345,11 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_compression(leaf, ei, compress_type); } else { - page = find_get_page(inode->i_mapping, - start >> PAGE_SHIFT); + page = find_get_page(inode->vfs_inode.i_mapping, 0); btrfs_set_file_extent_compression(leaf, ei, 0); - kaddr = kmap_atomic(page); - offset = offset_in_page(start); - write_extent_buffer(leaf, kaddr + offset, ptr, size); - kunmap_atomic(kaddr); + kaddr = kmap_local_page(page); + write_extent_buffer(leaf, kaddr, ptr, size); + kunmap_local(kaddr); put_page(page); } btrfs_mark_buffer_dirty(leaf); @@ -314,21 +359,25 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, * We align size to sectorsize for inline extents just for simplicity * sake. */ - size = ALIGN(size, root->fs_info->sectorsize); - ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size); + ret = btrfs_inode_set_file_extent_range(inode, 0, + ALIGN(size, root->fs_info->sectorsize)); if (ret) goto fail; /* - * we're an inline extent, so nobody can - * extend the file past i_size without locking - * a page we already have locked. + * We're an inline extent, so nobody can extend the file past i_size + * without locking a page we already have locked. * - * We must do any isize and inode updates - * before we unlock the pages. Otherwise we - * could end up racing with unlink. + * We must do any i_size and inode updates before we unlock the pages. + * Otherwise we could end up racing with unlink. */ - BTRFS_I(inode)->disk_i_size = inode->i_size; + i_size = i_size_read(&inode->vfs_inode); + if (update_i_size && size > i_size) { + i_size_write(&inode->vfs_inode, size); + i_size = size; + } + inode->disk_i_size = i_size; + fail: return ret; } @@ -339,35 +388,31 @@ fail: * does the checks required to make sure the data is small enough * to fit as an inline extent. */ -static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, - u64 end, size_t compressed_size, +static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, + size_t compressed_size, int compress_type, - struct page **compressed_pages) + struct page **compressed_pages, + bool update_i_size) { struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; - u64 isize = i_size_read(&inode->vfs_inode); - u64 actual_end = min(end + 1, isize); - u64 inline_len = actual_end - start; - u64 aligned_end = ALIGN(end, fs_info->sectorsize); - u64 data_len = inline_len; + u64 data_len = (compressed_size ?: size); int ret; struct btrfs_path *path; - if (compressed_size) - data_len = compressed_size; - - if (start > 0 || - actual_end > fs_info->sectorsize || + /* + * We can create an inline extent if it ends at or beyond the current + * i_size, is no larger than a sector (decompressed), and the (possibly + * compressed) data fits in a leaf and the configured maximum inline + * size. + */ + if (size < i_size_read(&inode->vfs_inode) || + size > fs_info->sectorsize || data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) || - (!compressed_size && - (actual_end & (fs_info->sectorsize - 1)) == 0) || - end + 1 < isize || - data_len > fs_info->max_inline) { + data_len > fs_info->max_inline) return 1; - } path = btrfs_alloc_path(); if (!path) @@ -381,30 +426,20 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, trans->block_rsv = &inode->block_rsv; drop_args.path = path; - drop_args.start = start; - drop_args.end = aligned_end; + drop_args.start = 0; + drop_args.end = fs_info->sectorsize; drop_args.drop_cache = true; drop_args.replace_extent = true; - - if (compressed_size && compressed_pages) - drop_args.extent_item_size = btrfs_file_extent_calc_inline_size( - compressed_size); - else - drop_args.extent_item_size = btrfs_file_extent_calc_inline_size( - inline_len); - + drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len); ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } - if (isize > actual_end) - inline_len = min_t(u64, isize, actual_end); - ret = insert_inline_extent(trans, path, drop_args.extent_inserted, - root, &inode->vfs_inode, start, - inline_len, compressed_size, - compress_type, compressed_pages); + ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted, + size, compressed_size, compress_type, + compressed_pages, update_i_size); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); goto out; @@ -413,7 +448,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, goto out; } - btrfs_update_inode_bytes(inode, inline_len, drop_args.bytes_found); + btrfs_update_inode_bytes(inode, size, drop_args.bytes_found); ret = btrfs_update_inode(trans, root, inode); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); @@ -423,7 +458,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, goto out; } - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); + btrfs_set_inode_full_sync(inode); out: /* * Don't forget to free the reserved space, as for inlined extent @@ -452,7 +487,7 @@ struct async_chunk { struct page *locked_page; u64 start; u64 end; - unsigned int write_flags; + blk_opf_t write_flags; struct list_head extents; struct cgroup_subsys_state *blkcg_css; struct btrfs_work work; @@ -486,17 +521,6 @@ static noinline int add_async_extent(struct async_chunk *cow, } /* - * Check if the inode has flags compatible with compression - */ -static inline bool inode_can_compress(struct btrfs_inode *inode) -{ - if (inode->flags & BTRFS_INODE_NODATACOW || - inode->flags & BTRFS_INODE_NODATASUM) - return false; - return true; -} - -/* * Check if the inode needs to be submitted to compression, based on mount * options, defragmentation, properties or heuristics. */ @@ -505,7 +529,7 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, { struct btrfs_fs_info *fs_info = inode->root->fs_info; - if (!inode_can_compress(inode)) { + if (!btrfs_inode_can_compress(inode)) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: unexpected compression for ino %llu\n", btrfs_ino(inode)); @@ -538,8 +562,8 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, * will unlock the full page. */ if (fs_info->sectorsize < PAGE_SIZE) { - if (!IS_ALIGNED(start, PAGE_SIZE) || - !IS_ALIGNED(end + 1, PAGE_SIZE)) + if (!PAGE_ALIGNED(start) || + !PAGE_ALIGNED(end + 1)) return 0; } @@ -560,12 +584,12 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, } static inline void inode_should_defrag(struct btrfs_inode *inode, - u64 start, u64 end, u64 num_bytes, u64 small_write) + u64 start, u64 end, u64 num_bytes, u32 small_write) { /* If this is a small write inside eof, kick off a defrag */ if (num_bytes < small_write && (start > 0 || end + 1 < inode->disk_i_size)) - btrfs_add_inode_defrag(NULL, inode); + btrfs_add_inode_defrag(NULL, inode, small_write); } /* @@ -624,7 +648,6 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) again: will_compress = 0; nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; - BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0); nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED / PAGE_SIZE); @@ -657,8 +680,8 @@ again: * Thus we must also check against @actual_end, not just @end. */ if (blocksize < PAGE_SIZE) { - if (!IS_ALIGNED(start, PAGE_SIZE) || - !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE)) + if (!PAGE_ALIGNED(start) || + !PAGE_ALIGNED(round_up(actual_end, blocksize))) goto cleanup_and_bail_uncompressed; } @@ -735,14 +758,15 @@ cont: /* we didn't compress the entire range, try * to make an uncompressed inline extent. */ - ret = cow_file_range_inline(BTRFS_I(inode), start, end, + ret = cow_file_range_inline(BTRFS_I(inode), actual_end, 0, BTRFS_COMPRESS_NONE, - NULL); + NULL, false); } else { /* try making a compressed inline extent */ - ret = cow_file_range_inline(BTRFS_I(inode), start, end, + ret = cow_file_range_inline(BTRFS_I(inode), actual_end, total_compressed, - compress_type, pages); + compress_type, pages, + false); } if (ret <= 0) { unsigned long clear_flags = EXTENT_DELALLOC | @@ -898,15 +922,25 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, * can directly submit them without interruption. */ ret = cow_file_range(inode, locked_page, start, end, &page_started, - &nr_written, 0); + &nr_written, 0, NULL); /* Inline extent inserted, page gets unlocked and everything is done */ if (page_started) { ret = 0; goto out; } if (ret < 0) { - if (locked_page) + btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); + if (locked_page) { + const u64 page_start = page_offset(locked_page); + const u64 page_end = page_start + PAGE_SIZE - 1; + + btrfs_page_set_error(inode->root->fs_info, locked_page, + page_start, PAGE_SIZE); + set_page_writeback(locked_page); + end_page_writeback(locked_page); + end_extent_writepage(locked_page, ret, page_start, page_end); unlock_page(locked_page); + } goto out; } @@ -943,7 +977,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode, if (!(start >= locked_page_end || end <= locked_page_start)) locked_page = async_chunk->locked_page; } - lock_extent(io_tree, start, end); + lock_extent(io_tree, start, end, NULL); /* We have fall back to uncompressed write */ if (!async_extent->pages) @@ -981,13 +1015,16 @@ static int submit_one_async_extent(struct btrfs_inode *inode, } free_extent_map(em); - ret = btrfs_add_ordered_extent_compress(inode, start, /* file_offset */ - ins.objectid, /* disk_bytenr */ - async_extent->ram_size, /* num_bytes */ - ins.offset, /* disk_num_bytes */ - async_extent->compress_type); + ret = btrfs_add_ordered_extent(inode, start, /* file_offset */ + async_extent->ram_size, /* num_bytes */ + async_extent->ram_size, /* ram_bytes */ + ins.objectid, /* disk_bytenr */ + ins.offset, /* disk_num_bytes */ + 0, /* offset */ + 1 << BTRFS_ORDERED_COMPRESSED, + async_extent->compress_type); if (ret) { - btrfs_drop_extent_cache(inode, start, end, 0); + btrfs_drop_extent_map_range(inode, start, end, false); goto out_free_reserve; } btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -1003,7 +1040,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode, async_extent->pages, /* compressed_pages */ async_extent->nr_pages, async_chunk->write_flags, - async_chunk->blkcg_css)) { + async_chunk->blkcg_css, true)) { const u64 start = async_extent->start; const u64 end = start + async_extent->ram_size - 1; @@ -1108,15 +1145,39 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * *page_started is set to one if we unlock locked_page and do everything * required to start IO on it. It may be clean and already done with * IO when we return. + * + * When unlock == 1, we unlock the pages in successfully allocated regions. + * When unlock == 0, we leave them locked for writing them out. + * + * However, we unlock all the pages except @locked_page in case of failure. + * + * In summary, page locking state will be as follow: + * + * - page_started == 1 (return value) + * - All the pages are unlocked. IO is started. + * - Note that this can happen only on success + * - unlock == 1 + * - All the pages except @locked_page are unlocked in any case + * - unlock == 0 + * - On success, all the pages are locked for writing out them + * - On failure, all the pages except @locked_page are unlocked + * + * When a failure happens in the second or later iteration of the + * while-loop, the ordered extents created in previous iterations are kept + * intact. So, the caller must clean them up by calling + * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for + * example. */ static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock) + unsigned long *nr_written, int unlock, + u64 *done_offset) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; u64 alloc_hint = 0; + u64 orig_start = start; u64 num_bytes; unsigned long ram_size; u64 cur_alloc_size = 0; @@ -1130,7 +1191,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, int ret = 0; if (btrfs_is_free_space_inode(inode)) { - WARN_ON_ONCE(1); ret = -EINVAL; goto out_unlock; } @@ -1152,9 +1212,12 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * So here we skip inline extent creation completely. */ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { + u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode), + end + 1); + /* lets try to make an inline extent */ - ret = cow_file_range_inline(inode, start, end, 0, - BTRFS_COMPRESS_NONE, NULL); + ret = cow_file_range_inline(inode, actual_end, 0, + BTRFS_COMPRESS_NONE, NULL, false); if (ret == 0) { /* * We use DO_ACCOUNTING here because we need the @@ -1191,7 +1254,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, } alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); - btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); /* * Relocation relies on the relocated extents to have exactly the same @@ -1234,9 +1296,10 @@ static noinline int cow_file_range(struct btrfs_inode *inode, } free_extent_map(em); - ret = btrfs_add_ordered_extent(inode, start, ins.objectid, - ram_size, cur_alloc_size, - BTRFS_ORDERED_REGULAR); + ret = btrfs_add_ordered_extent(inode, start, ram_size, ram_size, + ins.objectid, cur_alloc_size, 0, + 1 << BTRFS_ORDERED_REGULAR, + BTRFS_COMPRESS_NONE); if (ret) goto out_drop_extent_cache; @@ -1255,8 +1318,9 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * skip current ordered extent. */ if (ret) - btrfs_drop_extent_cache(inode, start, - start + ram_size - 1, 0); + btrfs_drop_extent_map_range(inode, start, + start + ram_size - 1, + false); } btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -1296,23 +1360,67 @@ out: return ret; out_drop_extent_cache: - btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); + btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false); out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_unlock: + /* + * If done_offset is non-NULL and ret == -EAGAIN, we expect the + * caller to write out the successfully allocated region and retry. + */ + if (done_offset && ret == -EAGAIN) { + if (orig_start < start) + *done_offset = start - 1; + else + *done_offset = start; + return ret; + } else if (ret == -EAGAIN) { + /* Convert to -ENOSPC since the caller cannot retry. */ + ret = -ENOSPC; + } + + /* + * Now, we have three regions to clean up: + * + * |-------(1)----|---(2)---|-------------(3)----------| + * `- orig_start `- start `- start + cur_alloc_size `- end + * + * We process each region below. + */ + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; + + /* + * For the range (1). We have already instantiated the ordered extents + * for this region. They are cleaned up by + * btrfs_cleanup_ordered_extents() in e.g, + * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are + * already cleared in the above loop. And, EXTENT_DELALLOC_NEW | + * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup + * function. + * + * However, in case of unlock == 0, we still need to unlock the pages + * (except @locked_page) to ensure all the pages are unlocked. + */ + if (!unlock && orig_start < start) { + if (!locked_page) + mapping_set_error(inode->vfs_inode.i_mapping, ret); + extent_clear_unlock_delalloc(inode, orig_start, start - 1, + locked_page, 0, page_ops); + } + /* - * If we reserved an extent for our delalloc range (or a subrange) and - * failed to create the respective ordered extent, then it means that - * when we reserved the extent we decremented the extent's size from - * the data space_info's bytes_may_use counter and incremented the - * space_info's bytes_reserved counter by the same amount. We must make - * sure extent_clear_unlock_delalloc() does not try to decrement again - * the data space_info's bytes_may_use counter, therefore we do not pass - * it the flag EXTENT_CLEAR_DATA_RESV. + * For the range (2). If we reserved an extent for our delalloc range + * (or a subrange) and failed to create the respective ordered extent, + * then it means that when we reserved the extent we decremented the + * extent's size from the data space_info's bytes_may_use counter and + * incremented the space_info's bytes_reserved counter by the same + * amount. We must make sure extent_clear_unlock_delalloc() does not try + * to decrement again the data space_info's bytes_may_use counter, + * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. */ if (extent_reserved) { extent_clear_unlock_delalloc(inode, start, @@ -1322,12 +1430,19 @@ out_unlock: page_ops); start += cur_alloc_size; if (start >= end) - goto out; + return ret; } + + /* + * For the range (3). We never touched the region. In addition to the + * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data + * space_info's bytes_may_use counter, reserved in + * btrfs_check_data_free_space(). + */ extent_clear_unlock_delalloc(inode, start, end, locked_page, clear_bits | EXTENT_CLEAR_DATA_RESV, page_ops); - goto out; + return ret; } /* @@ -1407,9 +1522,9 @@ static int cow_file_range_async(struct btrfs_inode *inode, int i; bool should_compress; unsigned nofs_flag; - const unsigned int write_flags = wbc_to_write_flags(wbc); + const blk_opf_t write_flags = wbc_to_write_flags(wbc); - unlock_extent(&inode->io_tree, start, end); + unlock_extent(&inode->io_tree, start, end, NULL); if (inode->flags & BTRFS_INODE_NOCOMPRESS && !btrfs_test_opt(fs_info, FORCE_COMPRESS)) { @@ -1510,26 +1625,48 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, u64 end, int *page_started, unsigned long *nr_written) { + u64 done_offset = end; int ret; + bool locked_page_done = false; - ret = cow_file_range(inode, locked_page, start, end, page_started, - nr_written, 0); - if (ret) - return ret; + while (start <= end) { + ret = cow_file_range(inode, locked_page, start, end, page_started, + nr_written, 0, &done_offset); + if (ret && ret != -EAGAIN) + return ret; - if (*page_started) - return 0; + if (*page_started) { + ASSERT(ret == 0); + return 0; + } + + if (ret == 0) + done_offset = end; + + if (done_offset == start) { + wait_on_bit_io(&inode->root->fs_info->flags, + BTRFS_FS_NEED_ZONE_FINISH, + TASK_UNINTERRUPTIBLE); + continue; + } + + if (!locked_page_done) { + __set_page_dirty_nobuffers(locked_page); + account_page_redirty(locked_page); + } + locked_page_done = true; + extent_write_locked_range(&inode->vfs_inode, start, done_offset); + + start = done_offset + 1; + } - __set_page_dirty_nobuffers(locked_page); - account_page_redirty(locked_page); - extent_write_locked_range(&inode->vfs_inode, start, end); *page_started = 1; return 0; } static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, - u64 bytenr, u64 num_bytes) + u64 bytenr, u64 num_bytes, bool nowait) { struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr); struct btrfs_ordered_sum *sums; @@ -1537,7 +1674,8 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, LIST_HEAD(list); ret = btrfs_lookup_csums_range(csum_root, bytenr, - bytenr + num_bytes - 1, &list, 0); + bytenr + num_bytes - 1, &list, 0, + nowait); if (ret == 0 && list_empty(&list)) return 0; @@ -1610,11 +1748,148 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, if (count > 0) clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, - 0, 0, NULL); + NULL); } return cow_file_range(inode, locked_page, start, end, page_started, - nr_written, 1); + nr_written, 1, NULL); +} + +struct can_nocow_file_extent_args { + /* Input fields. */ + + /* Start file offset of the range we want to NOCOW. */ + u64 start; + /* End file offset (inclusive) of the range we want to NOCOW. */ + u64 end; + bool writeback_path; + bool strict; + /* + * Free the path passed to can_nocow_file_extent() once it's not needed + * anymore. + */ + bool free_path; + + /* Output fields. Only set when can_nocow_file_extent() returns 1. */ + + u64 disk_bytenr; + u64 disk_num_bytes; + u64 extent_offset; + /* Number of bytes that can be written to in NOCOW mode. */ + u64 num_bytes; +}; + +/* + * Check if we can NOCOW the file extent that the path points to. + * This function may return with the path released, so the caller should check + * if path->nodes[0] is NULL or not if it needs to use the path afterwards. + * + * Returns: < 0 on error + * 0 if we can not NOCOW + * 1 if we can NOCOW + */ +static int can_nocow_file_extent(struct btrfs_path *path, + struct btrfs_key *key, + struct btrfs_inode *inode, + struct can_nocow_file_extent_args *args) +{ + const bool is_freespace_inode = btrfs_is_free_space_inode(inode); + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_root *root = inode->root; + struct btrfs_file_extent_item *fi; + u64 extent_end; + u8 extent_type; + int can_nocow = 0; + int ret = 0; + bool nowait = path->nowait; + + fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + goto out; + + /* Can't access these fields unless we know it's not an inline extent. */ + args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + args->extent_offset = btrfs_file_extent_offset(leaf, fi); + + if (!(inode->flags & BTRFS_INODE_NODATACOW) && + extent_type == BTRFS_FILE_EXTENT_REG) + goto out; + + /* + * If the extent was created before the generation where the last snapshot + * for its subvolume was created, then this implies the extent is shared, + * hence we must COW. + */ + if (!args->strict && + btrfs_file_extent_generation(leaf, fi) <= + btrfs_root_last_snapshot(&root->root_item)) + goto out; + + /* An explicit hole, must COW. */ + if (args->disk_bytenr == 0) + goto out; + + /* Compressed/encrypted/encoded extents must be COWed. */ + if (btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + goto out; + + extent_end = btrfs_file_extent_end(path); + + /* + * The following checks can be expensive, as they need to take other + * locks and do btree or rbtree searches, so release the path to avoid + * blocking other tasks for too long. + */ + btrfs_release_path(path); + + ret = btrfs_cross_ref_exist(root, btrfs_ino(inode), + key->offset - args->extent_offset, + args->disk_bytenr, false, path); + WARN_ON_ONCE(ret > 0 && is_freespace_inode); + if (ret != 0) + goto out; + + if (args->free_path) { + /* + * We don't need the path anymore, plus through the + * csum_exist_in_range() call below we will end up allocating + * another path. So free the path to avoid unnecessary extra + * memory usage. + */ + btrfs_free_path(path); + path = NULL; + } + + /* If there are pending snapshots for this root, we must COW. */ + if (args->writeback_path && !is_freespace_inode && + atomic_read(&root->snapshot_force_cow)) + goto out; + + args->disk_bytenr += args->extent_offset; + args->disk_bytenr += args->start - key->offset; + args->num_bytes = min(args->end + 1, extent_end) - args->start; + + /* + * Force COW if csums exist in the range. This ensures that csums for a + * given extent are either valid or do not exist. + */ + ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes, + nowait); + WARN_ON_ONCE(ret > 0 && is_freespace_inode); + if (ret != 0) + goto out; + + can_nocow = 1; + out: + if (args->free_path && path) + btrfs_free_path(path); + + return ret < 0 ? ret : can_nocow; } /* @@ -1637,11 +1912,10 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, u64 cur_offset = start; int ret; bool check_prev = true; - const bool freespace_inode = btrfs_is_free_space_inode(inode); u64 ino = btrfs_ino(inode); + struct btrfs_block_group *bg; bool nocow = false; - u64 disk_bytenr = 0; - const bool force = inode->flags & BTRFS_INODE_NODATACOW; + struct can_nocow_file_extent_args nocow_args = { 0 }; path = btrfs_alloc_path(); if (!path) { @@ -1654,15 +1928,16 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, return -ENOMEM; } + nocow_args.end = end; + nocow_args.writeback_path = true; + while (1) { struct btrfs_key found_key; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; u64 extent_end; - u64 extent_offset; - u64 num_bytes = 0; - u64 disk_num_bytes; u64 ram_bytes; + u64 nocow_end; int extent_type; nocow = false; @@ -1738,116 +2013,38 @@ next_slot: fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); - + /* If this is triggered then we have a memory corruption. */ + ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES); + if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) { + ret = -EUCLEAN; + goto error; + } ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); - if (extent_type == BTRFS_FILE_EXTENT_REG || - extent_type == BTRFS_FILE_EXTENT_PREALLOC) { - disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - extent_offset = btrfs_file_extent_offset(leaf, fi); - extent_end = found_key.offset + - btrfs_file_extent_num_bytes(leaf, fi); - disk_num_bytes = - btrfs_file_extent_disk_num_bytes(leaf, fi); - /* - * If the extent we got ends before our current offset, - * skip to the next extent. - */ - if (extent_end <= cur_offset) { - path->slots[0]++; - goto next_slot; - } - /* Skip holes */ - if (disk_bytenr == 0) - goto out_check; - /* Skip compressed/encrypted/encoded extents */ - if (btrfs_file_extent_compression(leaf, fi) || - btrfs_file_extent_encryption(leaf, fi) || - btrfs_file_extent_other_encoding(leaf, fi)) - goto out_check; - /* - * If extent is created before the last volume's snapshot - * this implies the extent is shared, hence we can't do - * nocow. This is the same check as in - * btrfs_cross_ref_exist but without calling - * btrfs_search_slot. - */ - if (!freespace_inode && - btrfs_file_extent_generation(leaf, fi) <= - btrfs_root_last_snapshot(&root->root_item)) - goto out_check; - if (extent_type == BTRFS_FILE_EXTENT_REG && !force) - goto out_check; + extent_end = btrfs_file_extent_end(path); - /* - * The following checks can be expensive, as they need to - * take other locks and do btree or rbtree searches, so - * release the path to avoid blocking other tasks for too - * long. - */ - btrfs_release_path(path); + /* + * If the extent we got ends before our current offset, skip to + * the next extent. + */ + if (extent_end <= cur_offset) { + path->slots[0]++; + goto next_slot; + } - ret = btrfs_cross_ref_exist(root, ino, - found_key.offset - - extent_offset, disk_bytenr, false); - if (ret) { - /* - * ret could be -EIO if the above fails to read - * metadata. - */ - if (ret < 0) { - if (cow_start != (u64)-1) - cur_offset = cow_start; - goto error; - } + nocow_args.start = cur_offset; + ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args); + if (ret < 0) { + if (cow_start != (u64)-1) + cur_offset = cow_start; + goto error; + } else if (ret == 0) { + goto out_check; + } - WARN_ON_ONCE(freespace_inode); - goto out_check; - } - disk_bytenr += extent_offset; - disk_bytenr += cur_offset - found_key.offset; - num_bytes = min(end + 1, extent_end) - cur_offset; - /* - * If there are pending snapshots for this root, we - * fall into common COW way - */ - if (!freespace_inode && atomic_read(&root->snapshot_force_cow)) - goto out_check; - /* - * force cow if csum exists in the range. - * this ensure that csum for a given extent are - * either valid or do not exist. - */ - ret = csum_exist_in_range(fs_info, disk_bytenr, - num_bytes); - if (ret) { - /* - * ret could be -EIO if the above fails to read - * metadata. - */ - if (ret < 0) { - if (cow_start != (u64)-1) - cur_offset = cow_start; - goto error; - } - WARN_ON_ONCE(freespace_inode); - goto out_check; - } - /* If the extent's block group is RO, we must COW */ - if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) - goto out_check; + ret = 0; + bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr); + if (bg) nocow = true; - } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - extent_end = found_key.offset + ram_bytes; - extent_end = ALIGN(extent_end, fs_info->sectorsize); - /* Skip extents outside of our requested range */ - if (extent_end <= start) { - path->slots[0]++; - goto next_slot; - } - } else { - /* If this triggers then we have a memory corruption */ - BUG(); - } out_check: /* * If nocow is false then record the beginning of the range @@ -1879,15 +2076,17 @@ out_check: cow_start = (u64)-1; } + nocow_end = cur_offset + nocow_args.num_bytes - 1; + if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { - u64 orig_start = found_key.offset - extent_offset; + u64 orig_start = found_key.offset - nocow_args.extent_offset; struct extent_map *em; - em = create_io_em(inode, cur_offset, num_bytes, + em = create_io_em(inode, cur_offset, nocow_args.num_bytes, orig_start, - disk_bytenr, /* block_start */ - num_bytes, /* block_len */ - disk_num_bytes, /* orig_block_len */ + nocow_args.disk_bytenr, /* block_start */ + nocow_args.num_bytes, /* block_len */ + nocow_args.disk_num_bytes, /* orig_block_len */ ram_bytes, BTRFS_COMPRESS_NONE, BTRFS_ORDERED_PREALLOC); if (IS_ERR(em)) { @@ -1895,28 +2094,35 @@ out_check: goto error; } free_extent_map(em); - ret = btrfs_add_ordered_extent(inode, cur_offset, - disk_bytenr, num_bytes, - num_bytes, - BTRFS_ORDERED_PREALLOC); + ret = btrfs_add_ordered_extent(inode, + cur_offset, nocow_args.num_bytes, + nocow_args.num_bytes, + nocow_args.disk_bytenr, + nocow_args.num_bytes, 0, + 1 << BTRFS_ORDERED_PREALLOC, + BTRFS_COMPRESS_NONE); if (ret) { - btrfs_drop_extent_cache(inode, cur_offset, - cur_offset + num_bytes - 1, - 0); + btrfs_drop_extent_map_range(inode, cur_offset, + nocow_end, false); goto error; } } else { ret = btrfs_add_ordered_extent(inode, cur_offset, - disk_bytenr, num_bytes, - num_bytes, - BTRFS_ORDERED_NOCOW); + nocow_args.num_bytes, + nocow_args.num_bytes, + nocow_args.disk_bytenr, + nocow_args.num_bytes, + 0, + 1 << BTRFS_ORDERED_NOCOW, + BTRFS_COMPRESS_NONE); if (ret) goto error; } - if (nocow) - btrfs_dec_nocow_writers(fs_info, disk_bytenr); - nocow = false; + if (nocow) { + btrfs_dec_nocow_writers(bg); + nocow = false; + } if (btrfs_is_data_reloc_root(root)) /* @@ -1925,10 +2131,9 @@ out_check: * from freeing metadata of created ordered extent. */ ret = btrfs_reloc_clone_csums(inode, cur_offset, - num_bytes); + nocow_args.num_bytes); - extent_clear_unlock_delalloc(inode, cur_offset, - cur_offset + num_bytes - 1, + extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, @@ -1961,7 +2166,7 @@ out_check: error: if (nocow) - btrfs_dec_nocow_writers(fs_info, disk_bytenr); + btrfs_dec_nocow_writers(bg); if (ret && cur_offset < end) extent_clear_unlock_delalloc(inode, cur_offset, end, @@ -2012,18 +2217,17 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page * to use run_delalloc_nocow() here, like for regular * preallocated inodes. */ - ASSERT(!zoned || - (zoned && btrfs_is_data_reloc_root(inode->root))); + ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root)); ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, nr_written); - } else if (!inode_can_compress(inode) || + } else if (!btrfs_inode_can_compress(inode) || !inode_need_compress(inode, start, end)) { if (zoned) ret = run_delalloc_zoned(inode, locked_page, start, end, page_started, nr_written); else ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1); + page_started, nr_written, 1, NULL); } else { set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); ret = cow_file_range_async(inode, wbc, locked_page, start, end, @@ -2039,6 +2243,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page void btrfs_split_delalloc_extent(struct inode *inode, struct extent_state *orig, u64 split) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 size; /* not delalloc, ignore it */ @@ -2046,7 +2251,7 @@ void btrfs_split_delalloc_extent(struct inode *inode, return; size = orig->end - orig->start + 1; - if (size > BTRFS_MAX_EXTENT_SIZE) { + if (size > fs_info->max_extent_size) { u32 num_extents; u64 new_size; @@ -2055,10 +2260,10 @@ void btrfs_split_delalloc_extent(struct inode *inode, * applies here, just in reverse. */ new_size = orig->end - split + 1; - num_extents = count_max_extents(new_size); + num_extents = count_max_extents(fs_info, new_size); new_size = split - orig->start; - num_extents += count_max_extents(new_size); - if (count_max_extents(size) >= num_extents) + num_extents += count_max_extents(fs_info, new_size); + if (count_max_extents(fs_info, size) >= num_extents) return; } @@ -2075,6 +2280,7 @@ void btrfs_split_delalloc_extent(struct inode *inode, void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, struct extent_state *other) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 new_size, old_size; u32 num_extents; @@ -2088,7 +2294,7 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, new_size = other->end - new->start + 1; /* we're not bigger than the max, unreserve the space and go */ - if (new_size <= BTRFS_MAX_EXTENT_SIZE) { + if (new_size <= fs_info->max_extent_size) { spin_lock(&BTRFS_I(inode)->lock); btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); spin_unlock(&BTRFS_I(inode)->lock); @@ -2114,10 +2320,10 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, * this case. */ old_size = other->end - other->start + 1; - num_extents = count_max_extents(old_size); + num_extents = count_max_extents(fs_info, old_size); old_size = new->end - new->start + 1; - num_extents += count_max_extents(old_size); - if (count_max_extents(new_size) >= num_extents) + num_extents += count_max_extents(fs_info, old_size); + if (count_max_extents(fs_info, new_size) >= num_extents) return; spin_lock(&BTRFS_I(inode)->lock); @@ -2182,21 +2388,21 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root, * list of inodes that have pending delalloc work to be done. */ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, - unsigned *bits) + u32 bits) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) + if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC)) WARN_ON(1); /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { + if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 len = state->end + 1 - state->start; - u32 num_extents = count_max_extents(len); + u32 num_extents = count_max_extents(fs_info, len); bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode)); spin_lock(&BTRFS_I(inode)->lock); @@ -2211,7 +2417,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, fs_info->delalloc_batch); spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->delalloc_bytes += len; - if (*bits & EXTENT_DEFRAG) + if (bits & EXTENT_DEFRAG) BTRFS_I(inode)->defrag_bytes += len; if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, &BTRFS_I(inode)->runtime_flags)) @@ -2220,7 +2426,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, } if (!(state->state & EXTENT_DELALLOC_NEW) && - (*bits & EXTENT_DELALLOC_NEW)) { + (bits & EXTENT_DELALLOC_NEW)) { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 - state->start; @@ -2233,14 +2439,14 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, * accounting happens. */ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, - struct extent_state *state, unsigned *bits) + struct extent_state *state, u32 bits) { struct btrfs_inode *inode = BTRFS_I(vfs_inode); struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb); u64 len = state->end + 1 - state->start; - u32 num_extents = count_max_extents(len); + u32 num_extents = count_max_extents(fs_info, len); - if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) { + if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) { spin_lock(&inode->lock); inode->defrag_bytes -= len; spin_unlock(&inode->lock); @@ -2251,7 +2457,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, * but in this case, we are only testing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { + if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = inode->root; bool do_list = !btrfs_is_free_space_inode(inode); @@ -2264,7 +2470,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, * don't need to call delalloc_release_metadata if there is an * error. */ - if (*bits & EXTENT_CLEAR_META_RESV && + if (bits & EXTENT_CLEAR_META_RESV && root != fs_info->tree_root) btrfs_delalloc_release_metadata(inode, len, false); @@ -2274,7 +2480,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, if (!btrfs_is_data_reloc_root(root) && do_list && !(state->state & EXTENT_NORESERVE) && - (*bits & EXTENT_CLEAR_DATA_RESV)) + (bits & EXTENT_CLEAR_DATA_RESV)) btrfs_free_reserved_data_space_noquota(fs_info, len); percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, @@ -2289,11 +2495,11 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, } if ((state->state & EXTENT_DELALLOC_NEW) && - (*bits & EXTENT_DELALLOC_NEW)) { + (bits & EXTENT_DELALLOC_NEW)) { spin_lock(&inode->lock); ASSERT(inode->new_delalloc_bytes >= len); inode->new_delalloc_bytes -= len; - if (*bits & EXTENT_ADD_INODE_BYTES) + if (bits & EXTENT_ADD_INODE_BYTES) inode_add_bytes(&inode->vfs_inode, len); spin_unlock(&inode->lock); } @@ -2310,7 +2516,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, u64 dio_file_offset) { - return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); + return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false); } /* @@ -2345,7 +2551,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, ASSERT(pre + post < len); - lock_extent(&inode->io_tree, start, start + len - 1); + lock_extent(&inode->io_tree, start, start + len - 1, NULL); write_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); if (!em) { @@ -2419,7 +2625,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, out_unlock: write_unlock(&em_tree->lock); - unlock_extent(&inode->io_tree, start, start + len - 1); + unlock_extent(&inode->io_tree, start, start + len - 1, NULL); out: free_extent_map(split_pre); free_extent_map(split_mid); @@ -2488,94 +2694,75 @@ out: return errno_to_blk_status(ret); } -/* - * extent_io.c submission hook. This does the right thing for csum calculation - * on write, or reading the csums from the tree before a read. - * - * Rules about async/sync submit, - * a) read: sync submit - * - * b) write without checksum: sync submit - * - * c) write with checksum: - * c-1) if bio is issued by fsync: sync submit - * (sync_writers != 0) - * - * c-2) if root is reloc root: sync submit - * (only in case of buffered IO) - * - * c-3) otherwise: async submit - */ -blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags) - +void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; - enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; - blk_status_t ret = 0; - int skip_sum; - int async = !atomic_read(&BTRFS_I(inode)->sync_writers); - - skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || - test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); - - if (btrfs_is_free_space_inode(BTRFS_I(inode))) - metadata = BTRFS_WQ_ENDIO_FREE_SPACE; + struct btrfs_inode *bi = BTRFS_I(inode); + blk_status_t ret; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct page *page = bio_first_bvec_all(bio)->bv_page; - loff_t file_offset = page_offset(page); - - ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset); - if (ret) - goto out; + ret = extract_ordered_extent(bi, bio, + page_offset(bio_first_bvec_all(bio)->bv_page)); + if (ret) { + btrfs_bio_end_io(btrfs_bio(bio), ret); + return; + } } - if (btrfs_op(bio) != BTRFS_MAP_WRITE) { - ret = btrfs_bio_wq_end_io(fs_info, bio, metadata); - if (ret) - goto out; + /* + * If we need to checksum, and the I/O is not issued by fsync and + * friends, that is ->sync_writers != 0, defer the submission to a + * workqueue to parallelize it. + * + * Csum items for reloc roots have already been cloned at this point, + * so they are handled as part of the no-checksum case. + */ + if (!(bi->flags & BTRFS_INODE_NODATASUM) && + !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && + !btrfs_is_data_reloc_root(bi->root)) { + if (!atomic_read(&bi->sync_writers) && + btrfs_wq_submit_bio(inode, bio, mirror_num, 0, + btrfs_submit_bio_start)) + return; - if (bio_flags & EXTENT_BIO_COMPRESSED) { - ret = btrfs_submit_compressed_read(inode, bio, - mirror_num, - bio_flags); - goto out; - } else { - /* - * Lookup bio sums does extra checks around whether we - * need to csum or not, which is why we ignore skip_sum - * here. - */ - ret = btrfs_lookup_bio_sums(inode, bio, NULL); - if (ret) - goto out; + ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false); + if (ret) { + btrfs_bio_end_io(btrfs_bio(bio), ret); + return; } - goto mapit; - } else if (async && !skip_sum) { - /* csum items have already been cloned */ - if (btrfs_is_data_reloc_root(root)) - goto mapit; - /* we're doing a write, do the async checksumming */ - ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags, - 0, btrfs_submit_bio_start); - goto out; - } else if (!skip_sum) { - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); - if (ret) - goto out; } + btrfs_submit_bio(fs_info, bio, mirror_num); +} -mapit: - ret = btrfs_map_bio(fs_info, bio, mirror_num); +void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, + int mirror_num, enum btrfs_compression_type compress_type) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + blk_status_t ret; -out: + if (compress_type != BTRFS_COMPRESS_NONE) { + /* + * btrfs_submit_compressed_read will handle completing the bio + * if there were any errors, so just return here. + */ + btrfs_submit_compressed_read(inode, bio, mirror_num); + return; + } + + /* Save the original iter for read repair */ + btrfs_bio(bio)->iter = bio->bi_iter; + + /* + * Lookup bio sums does extra checks around whether we need to csum or + * not, which is why we ignore skip_sum here. + */ + ret = btrfs_lookup_bio_sums(inode, bio, NULL); if (ret) { - bio->bi_status = ret; - bio_endio(bio); + btrfs_bio_end_io(btrfs_bio(bio), ret); + return; } - return ret; + + btrfs_submit_bio(fs_info, bio, mirror_num); } /* @@ -2631,8 +2818,8 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, ret = set_extent_bit(&inode->io_tree, search_start, search_start + em_len - 1, - EXTENT_DELALLOC_NEW, 0, NULL, cached_state, - GFP_NOFS, NULL); + EXTENT_DELALLOC_NEW, cached_state, + GFP_NOFS); next: search_start = extent_map_end(em); free_extent_map(em); @@ -2744,7 +2931,7 @@ again: if (ret) goto out_page; - lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state); + lock_extent(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ if (PageOrdered(page)) @@ -2752,8 +2939,8 @@ again: ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); if (ordered) { - unlock_extent_cached(&inode->io_tree, page_start, page_end, - &cached_state); + unlock_extent(&inode->io_tree, page_start, page_end, + &cached_state); unlock_page(page); btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); @@ -2779,8 +2966,7 @@ out_reserved: if (free_delalloc_space) btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE, true); - unlock_extent_cached(&inode->io_tree, page_start, page_end, - &cached_state); + unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); out_page: if (ret) { /* @@ -2870,6 +3056,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_key ins; u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi); u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi); + u64 offset = btrfs_stack_file_extent_offset(stack_fi); u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi); u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi); struct btrfs_drop_extents_args drop_args = { 0 }; @@ -2944,7 +3131,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, goto out; ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode), - file_pos, qgroup_reserved, &ins); + file_pos - offset, + qgroup_reserved, &ins); out: btrfs_free_path(path); @@ -2970,20 +3158,22 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_extent *oe) { struct btrfs_file_extent_item stack_fi; - u64 logical_len; bool update_inode_bytes; + u64 num_bytes = oe->num_bytes; + u64 ram_bytes = oe->ram_bytes; memset(&stack_fi, 0, sizeof(stack_fi)); btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG); btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr); btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, oe->disk_num_bytes); - if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) - logical_len = oe->truncated_len; - else - logical_len = oe->num_bytes; - btrfs_set_stack_file_extent_num_bytes(&stack_fi, logical_len); - btrfs_set_stack_file_extent_ram_bytes(&stack_fi, logical_len); + btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset); + if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) { + num_bytes = oe->truncated_len; + ram_bytes = num_bytes; + } + btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes); + btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes); btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); /* Encryption and other encoding is reserved and all 0 */ @@ -2994,6 +3184,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, * except if the ordered extent was truncated. */ update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) || + test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) || test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags); return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), @@ -3006,7 +3197,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, * an ordered extent if the range of bytes in the file it covers are * fully written. */ -static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) +int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) { struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode); struct btrfs_root *root = inode->root; @@ -3028,10 +3219,13 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && - !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags)) + !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) && + !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags)) clear_bits |= EXTENT_DELALLOC_NEW; freespace_inode = btrfs_is_free_space_inode(inode); + if (!freespace_inode) + btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent); if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { ret = -EIO; @@ -3076,7 +3270,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) } clear_bits |= EXTENT_LOCKED; - lock_extent_bits(io_tree, start, end, &cached_state); + lock_extent(io_tree, start, end, &cached_state); if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); @@ -3098,6 +3292,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ordered_extent->file_offset, ordered_extent->file_offset + logical_len); + btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); } else { BUG_ON(root == fs_info->tree_root); ret = insert_ordered_extent_file_extent(trans, ordered_extent); @@ -3130,7 +3326,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) clear_extent_bit(&inode->io_tree, start, end, EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, - 0, 0, &cached_state); + &cached_state); btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, root, inode); @@ -3141,7 +3337,6 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ret = 0; out: clear_extent_bit(&inode->io_tree, start, end, clear_bits, - (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0, &cached_state); if (trans) @@ -3166,8 +3361,8 @@ out: unwritten_start += logical_len; clear_extent_uptodate(io_tree, unwritten_start, end, NULL); - /* Drop the cache for the part of the extent we didn't write. */ - btrfs_drop_extent_cache(inode, unwritten_start, end, 0); + /* Drop extent maps for the part of the extent we didn't write. */ + btrfs_drop_extent_map_range(inode, unwritten_start, end, false); /* * If the ordered extent had an IOERR or something else went @@ -3212,71 +3407,82 @@ out: return ret; } -static void finish_ordered_fn(struct btrfs_work *work) -{ - struct btrfs_ordered_extent *ordered_extent; - ordered_extent = container_of(work, struct btrfs_ordered_extent, work); - btrfs_finish_ordered_io(ordered_extent); -} - void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, struct page *page, u64 start, u64 end, bool uptodate) { trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate); - btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, - finish_ordered_fn, uptodate); + btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, uptodate); +} + +/* + * Verify the checksum for a single sector without any extra action that depend + * on the type of I/O. + */ +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, + u32 pgoff, u8 *csum, const u8 * const csum_expected) +{ + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + char *kaddr; + + ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE); + + shash->tfm = fs_info->csum_shash; + + kaddr = kmap_local_page(page) + pgoff; + crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); + kunmap_local(kaddr); + + if (memcmp(csum, csum_expected, fs_info->csum_size)) + return -EIO; + return 0; +} + +static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 offset) +{ + u64 offset_in_sectors = offset >> fs_info->sectorsize_bits; + + return csums + offset_in_sectors * fs_info->csum_size; } /* * check_data_csum - verify checksum of one sector of uncompressed data * @inode: inode - * @io_bio: btrfs_io_bio which contains the csum + * @bbio: btrfs_bio which contains the csum * @bio_offset: offset to the beginning of the bio (in bytes) * @page: page where is the data to be verified * @pgoff: offset inside the page - * @start: logical offset in the file * * The length of such check is always one sector size. + * + * When csum mismatch is detected, we will also report the error and fill the + * corrupted range with zero. (Thus it needs the extra parameters) */ -static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff, - u64 start) +int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, u32 pgoff) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - char *kaddr; u32 len = fs_info->sectorsize; - const u32 csum_size = fs_info->csum_size; - unsigned int offset_sectors; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; ASSERT(pgoff + len <= PAGE_SIZE); - offset_sectors = bio_offset >> fs_info->sectorsize_bits; - csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size; + csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); - kaddr = kmap_atomic(page); - shash->tfm = fs_info->csum_shash; - - crypto_shash_digest(shash, kaddr + pgoff, len, csum); - - if (memcmp(csum, csum_expected, csum_size)) + if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, csum_expected)) goto zeroit; - - kunmap_atomic(kaddr); return 0; + zeroit: - btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, - bbio->mirror_num); + btrfs_print_data_csum_error(BTRFS_I(inode), + bbio->file_offset + bio_offset, + csum, csum_expected, bbio->mirror_num); if (bbio->device) btrfs_dev_stat_inc_and_print(bbio->device, BTRFS_DEV_STAT_CORRUPTION_ERRS); - memset(kaddr + pgoff, 1, len); - flush_dcache_page(page); - kunmap_atomic(kaddr); + memzero_page(page, pgoff, len); return -EIO; } @@ -3304,11 +3510,6 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, u32 pg_off; unsigned int result = 0; - if (btrfs_page_test_checked(fs_info, page, start, end + 1 - start)) { - btrfs_page_clear_checked(fs_info, page, start, end + 1 - start); - return 0; - } - /* * This only happens for NODATASUM or compressed read. * Normally this should be covered by above check for compressed read @@ -3341,8 +3542,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, EXTENT_NODATASUM); continue; } - ret = check_data_csum(inode, bbio, bio_offset, page, pg_off, - page_offset(page) + pg_off); + ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off); if (ret < 0) { const int nr_bit = (pg_off - offset_in_page(start)) >> root->fs_info->sectorsize_bits; @@ -4062,7 +4262,8 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const char *name, int name_len) + const char *name, int name_len, + struct btrfs_rename_ctx *rename_ctx) { struct btrfs_root *root = dir->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -4118,15 +4319,27 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, goto err; } skip_backref: + if (rename_ctx) + rename_ctx->index = index; + ret = btrfs_delete_delayed_dir_index(trans, dir, index); if (ret) { btrfs_abort_transaction(trans, ret); goto err; } - btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, - dir_ino); - btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index); + /* + * If we are in a rename context, we don't need to update anything in the + * log. That will be done later during the rename by btrfs_log_new_name(). + * Besides that, doing it here would only cause extra unnecessary btree + * operations on the log tree, increasing latency for applications. + */ + if (!rename_ctx) { + btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, + dir_ino); + btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, + index); + } /* * If we have a pending delayed iput we could end up with the final iput @@ -4146,8 +4359,9 @@ err: btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2); inode_inc_iversion(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); - inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime = - dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode); + inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode); + dir->vfs_inode.i_mtime = inode->vfs_inode.i_ctime; + dir->vfs_inode.i_ctime = inode->vfs_inode.i_ctime; ret = btrfs_update_inode(trans, root, dir); out: return ret; @@ -4158,7 +4372,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, const char *name, int name_len) { int ret; - ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len); + ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len, NULL); if (!ret) { drop_nlink(&inode->vfs_inode); ret = btrfs_update_inode(trans, inode->root, inode); @@ -4184,8 +4398,9 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) * 1 for the dir index * 1 for the inode ref * 1 for the inode + * 1 for the parent inode */ - return btrfs_start_transaction_fallback_global_rsv(root, 5); + return btrfs_start_transaction_fallback_global_rsv(root, 6); } static int btrfs_unlink(struct inode *dir, struct dentry *dentry) @@ -4308,7 +4523,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2); inode_inc_iversion(dir); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = current_time(dir); + dir->i_ctime = dir->i_mtime; ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir)); if (ret) btrfs_abort_transaction(trans, ret); @@ -4460,6 +4676,13 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) dest->root_key.objectid); return -EPERM; } + if (atomic_read(&dest->nr_swapfiles)) { + spin_unlock(&dest->root_item_lock); + btrfs_warn(fs_info, + "attempt to delete subvolume %llu with active swapfile", + root->root_key.objectid); + return -EPERM; + } root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags | BTRFS_ROOT_SUBVOL_DEAD); @@ -4565,14 +4788,21 @@ out_up_write: static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; int err = 0; struct btrfs_trans_handle *trans; u64 last_unlink_trans; if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; - if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) + if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) { + if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) { + btrfs_err(fs_info, + "extent tree v2 doesn't support snapshot deletion yet"); + return -EOPNOTSUPP; + } return btrfs_delete_subvolume(dir, dentry); + } trans = __unlink_start_trans(dir); if (IS_ERR(trans)) @@ -4611,7 +4841,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) } out: btrfs_end_transaction(trans); - btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); + btrfs_btree_balance_dirty(fs_info); return err; } @@ -4655,16 +4885,16 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, block_end = block_start + blocksize - 1; ret = btrfs_check_data_free_space(inode, &data_reserved, block_start, - blocksize); + blocksize, false); if (ret < 0) { - if (btrfs_check_nocow_lock(inode, block_start, &write_bytes) > 0) { + if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) { /* For nocow case, no need to reserve data space */ only_release_metadata = true; } else { goto out; } } - ret = btrfs_delalloc_reserve_metadata(inode, blocksize); + ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false); if (ret < 0) { if (!only_release_metadata) btrfs_free_reserved_data_space(inode, data_reserved, @@ -4685,7 +4915,7 @@ again: goto out_unlock; if (!PageUptodate(page)) { - ret = btrfs_readpage(NULL, page); + ret = btrfs_read_folio(NULL, page_folio(page)); lock_page(page); if (page->mapping != mapping) { unlock_page(page); @@ -4699,12 +4929,11 @@ again: } wait_on_page_writeback(page); - lock_extent_bits(io_tree, block_start, block_end, &cached_state); + lock_extent(io_tree, block_start, block_end, &cached_state); ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { - unlock_extent_cached(io_tree, block_start, block_end, - &cached_state); + unlock_extent(io_tree, block_start, block_end, &cached_state); unlock_page(page); put_page(page); btrfs_start_ordered_extent(ordered, 1); @@ -4714,13 +4943,12 @@ again: clear_extent_bit(&inode->io_tree, block_start, block_end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, &cached_state); + &cached_state); ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, &cached_state); if (ret) { - unlock_extent_cached(io_tree, block_start, block_end, - &cached_state); + unlock_extent(io_tree, block_start, block_end, &cached_state); goto out_unlock; } @@ -4733,16 +4961,15 @@ again: else memzero_page(page, (block_start - page_offset(page)) + offset, len); - flush_dcache_page(page); } btrfs_page_clear_checked(fs_info, page, block_start, block_end + 1 - block_start); btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start); - unlock_extent_cached(io_tree, block_start, block_end, &cached_state); + unlock_extent(io_tree, block_start, block_end, &cached_state); if (only_release_metadata) set_extent_bit(&inode->io_tree, block_start, block_end, - EXTENT_NORESERVE, 0, NULL, NULL, GFP_NOFS, NULL); + EXTENT_NORESERVE, NULL, GFP_NOFS); out_unlock: if (ret) { @@ -4799,8 +5026,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode, return ret; } - ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), - offset, 0, 0, len, 0, len, 0, 0, 0); + ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len); if (ret) { btrfs_abort_transaction(trans, ret); } else { @@ -4824,7 +5050,6 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) struct extent_io_tree *io_tree = &inode->io_tree; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; - struct extent_map_tree *em_tree = &inode->extent_tree; u64 hole_start = ALIGN(oldsize, fs_info->sectorsize); u64 block_end = ALIGN(size, fs_info->sectorsize); u64 last_byte; @@ -4872,12 +5097,12 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) if (err) break; - btrfs_drop_extent_cache(inode, cur_offset, - cur_offset + hole_size - 1, 0); hole_em = alloc_extent_map(); if (!hole_em) { - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &inode->runtime_flags); + btrfs_drop_extent_map_range(inode, cur_offset, + cur_offset + hole_size - 1, + false); + btrfs_set_inode_full_sync(inode); goto next; } hole_em->start = cur_offset; @@ -4891,16 +5116,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = fs_info->generation; - while (1) { - write_lock(&em_tree->lock); - err = add_extent_mapping(em_tree, hole_em, 1); - write_unlock(&em_tree->lock); - if (err != -EEXIST) - break; - btrfs_drop_extent_cache(inode, cur_offset, - cur_offset + - hole_size - 1, 0); - } + err = btrfs_replace_extent_map_range(inode, hole_em, true); free_extent_map(hole_em); } else { err = btrfs_inode_set_file_extent_range(inode, @@ -4916,7 +5132,7 @@ next: break; } free_extent_map(em); - unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state); + unlock_extent(io_tree, hole_start, block_end - 1, &cached_state); return err; } @@ -4937,9 +5153,10 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) */ if (newsize != oldsize) { inode_inc_iversion(inode); - if (!(mask & (ATTR_CTIME | ATTR_MTIME))) - inode->i_ctime = inode->i_mtime = - current_time(inode); + if (!(mask & (ATTR_CTIME | ATTR_MTIME))) { + inode->i_mtime = current_time(inode); + inode->i_ctime = inode->i_mtime; + } } if (newsize > oldsize) { @@ -5046,43 +5263,27 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr } /* - * While truncating the inode pages during eviction, we get the VFS calling - * btrfs_invalidatepage() against each page of the inode. This is slow because - * the calls to btrfs_invalidatepage() result in a huge amount of calls to - * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting - * extent_state structures over and over, wasting lots of time. + * While truncating the inode pages during eviction, we get the VFS + * calling btrfs_invalidate_folio() against each folio of the inode. This + * is slow because the calls to btrfs_invalidate_folio() result in a + * huge amount of calls to lock_extent() and clear_extent_bit(), + * which keep merging and splitting extent_state structures over and over, + * wasting lots of time. * - * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all - * those expensive operations on a per page basis and do only the ordered io - * finishing, while we release here the extent_map and extent_state structures, - * without the excessive merging and splitting. + * Therefore if the inode is being evicted, let btrfs_invalidate_folio() + * skip all those expensive operations on a per folio basis and do only + * the ordered io finishing, while we release here the extent_map and + * extent_state structures, without the excessive merging and splitting. */ static void evict_inode_truncate_pages(struct inode *inode) { struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree; struct rb_node *node; ASSERT(inode->i_state & I_FREEING); truncate_inode_pages_final(&inode->i_data); - write_lock(&map_tree->lock); - while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) { - struct extent_map *em; - - node = rb_first_cached(&map_tree->map); - em = rb_entry(node, struct extent_map, rb_node); - clear_bit(EXTENT_FLAG_PINNED, &em->flags); - clear_bit(EXTENT_FLAG_LOGGING, &em->flags); - remove_extent_mapping(map_tree, em); - free_extent_map(em); - if (need_resched()) { - write_unlock(&map_tree->lock); - cond_resched(); - write_lock(&map_tree->lock); - } - } - write_unlock(&map_tree->lock); + btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); /* * Keep looping until we have no more ranges in the io tree. @@ -5115,13 +5316,13 @@ static void evict_inode_truncate_pages(struct inode *inode) state_flags = state->state; spin_unlock(&io_tree->lock); - lock_extent_bits(io_tree, start, end, &cached_state); + lock_extent(io_tree, start, end, &cached_state); /* * If still has DELALLOC flag, the extent didn't reach disk, * and its reserved space won't be freed by delayed_ref. * So we need to free its reserved space here. - * (Refer to comment in btrfs_invalidatepage, case 2) + * (Refer to comment in btrfs_invalidate_folio, case 2) * * Note, end is the bytenr of last byte, so we need + 1 here. */ @@ -5130,8 +5331,7 @@ static void evict_inode_truncate_pages(struct inode *inode) end - start + 1); clear_extent_bit(io_tree, start, end, - EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, + EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING, &cached_state); cond_resched(); @@ -5246,7 +5446,7 @@ void btrfs_evict_inode(struct inode *inode) if (!rsv) goto no_delete; rsv->size = btrfs_calc_metadata_size(fs_info, 1); - rsv->failfast = 1; + rsv->failfast = true; btrfs_i_size_write(BTRFS_I(inode), 0); @@ -5484,6 +5684,11 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) BTRFS_I(inode)->location.offset = 0; BTRFS_I(inode)->root = btrfs_grab_root(args->root); BUG_ON(args->root && !BTRFS_I(inode)->root); + + if (args->root && args->root == args->root->fs_info->tree_root && + args->ino != BTRFS_BTREE_INODE_OBJECTID) + set_bit(BTRFS_INODE_FREE_SPACE_INODE, + &BTRFS_I(inode)->runtime_flags); return 0; } @@ -5584,21 +5789,17 @@ static struct inode *new_simple_dir(struct super_block *s, return inode; } +static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN); +static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE); +static_assert(BTRFS_FT_DIR == FT_DIR); +static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV); +static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV); +static_assert(BTRFS_FT_FIFO == FT_FIFO); +static_assert(BTRFS_FT_SOCK == FT_SOCK); +static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK); + static inline u8 btrfs_inode_type(struct inode *inode) { - /* - * Compile-time asserts that generic FT_* types still match - * BTRFS_FT_* types - */ - BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN); - BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE); - BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR); - BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV); - BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV); - BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO); - BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK); - BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK); - return fs_umode_to_ftype(inode->i_mode); } @@ -5642,14 +5843,14 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (ret != -ENOENT) inode = ERR_PTR(ret); else - inode = new_simple_dir(dir->i_sb, &location, sub_root); + inode = new_simple_dir(dir->i_sb, &location, root); } else { inode = btrfs_iget(dir->i_sb, location.objectid, sub_root); - } - if (root != sub_root) btrfs_put_root(sub_root); - if (!IS_ERR(inode) && root != sub_root) { + if (IS_ERR(inode)) + return inode; + down_read(&fs_info->cleanup_work_sem); if (!sb_rdonly(inode->i_sb)) ret = btrfs_orphan_cleanup(sub_root); @@ -5755,8 +5956,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) struct list_head ins_list; struct list_head del_list; int ret; - struct extent_buffer *leaf; - int slot; char *name_ptr; int name_len; int entries = 0; @@ -5783,35 +5982,19 @@ again: key.offset = ctx->pos; key.objectid = btrfs_ino(BTRFS_I(inode)); - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto err; - - while (1) { + btrfs_for_each_slot(root, &key, &found_key, path, ret) { struct dir_entry *entry; - - leaf = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto err; - else if (ret > 0) - break; - continue; - } - - btrfs_item_key_to_cpu(leaf, &found_key, slot); + struct extent_buffer *leaf = path->nodes[0]; if (found_key.objectid != key.objectid) break; if (found_key.type != BTRFS_DIR_INDEX_KEY) break; if (found_key.offset < ctx->pos) - goto next; + continue; if (btrfs_should_delete_dir_index(&del_list, found_key.offset)) - goto next; - di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + continue; + di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); name_len = btrfs_dir_name_len(leaf, di); if ((total_len + sizeof(struct dir_entry) + name_len) >= PAGE_SIZE) { @@ -5838,9 +6021,11 @@ again: entries++; addr += sizeof(struct dir_entry) + name_len; total_len += sizeof(struct dir_entry) + name_len; -next: - path->slots[0]++; } + /* Catch error encountered during iteration */ + if (ret < 0) + goto err; + btrfs_release_path(path); ret = btrfs_filldir(private->filldir_buf, entries, ctx); @@ -5971,14 +6156,8 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) goto out; ret = 0; - /* - * MAGIC NUMBER EXPLANATION: - * since we search a directory based on f_pos we have to start at 2 - * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody - * else has to start at 2 - */ if (path->slots[0] == 0) { - inode->index_cnt = 2; + inode->index_cnt = BTRFS_DIR_START_INDEX; goto out; } @@ -5989,7 +6168,7 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) if (found_key.objectid != btrfs_ino(inode) || found_key.type != BTRFS_DIR_INDEX_KEY) { - inode->index_cnt = 2; + inode->index_cnt = BTRFS_DIR_START_INDEX; goto out; } @@ -6034,6 +6213,57 @@ static int btrfs_insert_inode_locked(struct inode *inode) btrfs_find_actor, &args); } +int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, + unsigned int *trans_num_items) +{ + struct inode *dir = args->dir; + struct inode *inode = args->inode; + int ret; + + ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl); + if (ret) + return ret; + + /* 1 to add inode item */ + *trans_num_items = 1; + /* 1 to add compression property */ + if (BTRFS_I(dir)->prop_compress) + (*trans_num_items)++; + /* 1 to add default ACL xattr */ + if (args->default_acl) + (*trans_num_items)++; + /* 1 to add access ACL xattr */ + if (args->acl) + (*trans_num_items)++; +#ifdef CONFIG_SECURITY + /* 1 to add LSM xattr */ + if (dir->i_security) + (*trans_num_items)++; +#endif + if (args->orphan) { + /* 1 to add orphan item */ + (*trans_num_items)++; + } else { + /* + * 1 to add dir item + * 1 to add dir index + * 1 to update parent inode item + * + * No need for 1 unit for the inode ref item because it is + * inserted in a batch together with the inode item at + * btrfs_create_new_inode(). + */ + *trans_num_items += 3; + } + return 0; +} + +void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args) +{ + posix_acl_release(args->acl); + posix_acl_release(args->default_acl); +} + /* * Inherit flags from the parent inode. * @@ -6043,9 +6273,6 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) { unsigned int flags; - if (!dir) - return; - flags = BTRFS_I(dir)->flags; if (flags & BTRFS_INODE_NOCOMPRESS) { @@ -6065,82 +6292,92 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) btrfs_sync_inode_flags_to_i_flags(inode); } -static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct user_namespace *mnt_userns, - struct inode *dir, - const char *name, int name_len, - u64 ref_objectid, u64 objectid, - umode_t mode, u64 *index) +int btrfs_create_new_inode(struct btrfs_trans_handle *trans, + struct btrfs_new_inode_args *args) { - struct btrfs_fs_info *fs_info = root->fs_info; - struct inode *inode; + struct inode *dir = args->dir; + struct inode *inode = args->inode; + const char *name = args->orphan ? NULL : args->dentry->d_name.name; + int name_len = args->orphan ? 0 : args->dentry->d_name.len; + struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_root *root; struct btrfs_inode_item *inode_item; struct btrfs_key *location; struct btrfs_path *path; + u64 objectid; struct btrfs_inode_ref *ref; struct btrfs_key key[2]; u32 sizes[2]; struct btrfs_item_batch batch; unsigned long ptr; - unsigned int nofs_flag; int ret; path = btrfs_alloc_path(); if (!path) - return ERR_PTR(-ENOMEM); - - nofs_flag = memalloc_nofs_save(); - inode = new_inode(fs_info->sb); - memalloc_nofs_restore(nofs_flag); - if (!inode) { - btrfs_free_path(path); - return ERR_PTR(-ENOMEM); - } + return -ENOMEM; - /* - * O_TMPFILE, set link count to 0, so that after this point, - * we fill in an inode item with the correct link count. - */ - if (!name) - set_nlink(inode, 0); + if (!args->subvol) + BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root); + root = BTRFS_I(inode)->root; - /* - * we have to initialize this early, so we can reclaim the inode - * number if we fail afterwards in this function. - */ + ret = btrfs_get_free_objectid(root, &objectid); + if (ret) + goto out; inode->i_ino = objectid; - if (dir && name) { + if (args->orphan) { + /* + * O_TMPFILE, set link count to 0, so that after this point, we + * fill in an inode item with the correct link count. + */ + set_nlink(inode, 0); + } else { trace_btrfs_inode_request(dir); - ret = btrfs_set_inode_index(BTRFS_I(dir), index); - if (ret) { - btrfs_free_path(path); - iput(inode); - return ERR_PTR(ret); - } - } else if (dir) { - *index = 0; + ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index); + if (ret) + goto out; } - /* - * index_cnt is ignored for everything but a dir, - * btrfs_set_inode_index_count has an explanation for the magic - * number - */ - BTRFS_I(inode)->index_cnt = 2; - BTRFS_I(inode)->dir_index = *index; - BTRFS_I(inode)->root = btrfs_grab_root(root); + /* index_cnt is ignored for everything but a dir. */ + BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX; BTRFS_I(inode)->generation = trans->transid; inode->i_generation = BTRFS_I(inode)->generation; /* + * Subvolumes don't inherit flags from their parent directory. + * Originally this was probably by accident, but we probably can't + * change it now without compatibility issues. + */ + if (!args->subvol) + btrfs_inherit_iflags(inode, dir); + + if (S_ISREG(inode->i_mode)) { + if (btrfs_test_opt(fs_info, NODATASUM)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; + if (btrfs_test_opt(fs_info, NODATACOW)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM; + } + + location = &BTRFS_I(inode)->location; + location->objectid = objectid; + location->offset = 0; + location->type = BTRFS_INODE_ITEM_KEY; + + ret = btrfs_insert_inode_locked(inode); + if (ret < 0) { + if (!args->orphan) + BTRFS_I(dir)->index_cnt--; + goto out; + } + + /* * We could have gotten an inode number from somebody who was fsynced * and then removed in this same transaction, so let's just set full * sync since it will be a full sync anyway and this will blow away the * old info in the log. */ - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + btrfs_set_inode_full_sync(BTRFS_I(inode)); key[0].objectid = objectid; key[0].type = BTRFS_INODE_ITEM_KEY; @@ -6148,7 +6385,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, sizes[0] = sizeof(struct btrfs_inode_item); - if (name) { + if (!args->orphan) { /* * Start new inodes with an inode_ref. This is slightly more * efficient for small numbers of hard links since they will @@ -6157,64 +6394,101 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, */ key[1].objectid = objectid; key[1].type = BTRFS_INODE_REF_KEY; - key[1].offset = ref_objectid; - - sizes[1] = name_len + sizeof(*ref); - } - - location = &BTRFS_I(inode)->location; - location->objectid = objectid; - location->offset = 0; - location->type = BTRFS_INODE_ITEM_KEY; - - ret = btrfs_insert_inode_locked(inode); - if (ret < 0) { - iput(inode); - goto fail; + if (args->subvol) { + key[1].offset = objectid; + sizes[1] = 2 + sizeof(*ref); + } else { + key[1].offset = btrfs_ino(BTRFS_I(dir)); + sizes[1] = name_len + sizeof(*ref); + } } batch.keys = &key[0]; batch.data_sizes = &sizes[0]; - batch.total_data_size = sizes[0] + (name ? sizes[1] : 0); - batch.nr = name ? 2 : 1; + batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]); + batch.nr = args->orphan ? 1 : 2; ret = btrfs_insert_empty_items(trans, root, path, &batch); - if (ret != 0) - goto fail_unlock; - - inode_init_owner(mnt_userns, inode, dir, mode); - inode_set_bytes(inode, 0); + if (ret != 0) { + btrfs_abort_transaction(trans, ret); + goto discard; + } inode->i_mtime = current_time(inode); inode->i_atime = inode->i_mtime; inode->i_ctime = inode->i_mtime; BTRFS_I(inode)->i_otime = inode->i_mtime; + /* + * We're going to fill the inode item now, so at this point the inode + * must be fully initialized. + */ + inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item, sizeof(*inode_item)); fill_inode_item(trans, path->nodes[0], inode_item, inode); - if (name) { + if (!args->orphan) { ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, struct btrfs_inode_ref); - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); - btrfs_set_inode_ref_index(path->nodes[0], ref, *index); ptr = (unsigned long)(ref + 1); - write_extent_buffer(path->nodes[0], name, ptr, name_len); + if (args->subvol) { + btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2); + btrfs_set_inode_ref_index(path->nodes[0], ref, 0); + write_extent_buffer(path->nodes[0], "..", ptr, 2); + } else { + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, + BTRFS_I(inode)->dir_index); + write_extent_buffer(path->nodes[0], name, ptr, name_len); + } } btrfs_mark_buffer_dirty(path->nodes[0]); + /* + * We don't need the path anymore, plus inheriting properties, adding + * ACLs, security xattrs, orphan item or adding the link, will result in + * allocating yet another path. So just free our path. + */ btrfs_free_path(path); + path = NULL; - btrfs_inherit_iflags(inode, dir); + if (args->subvol) { + struct inode *parent; - if (S_ISREG(mode)) { - if (btrfs_test_opt(fs_info, NODATASUM)) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; - if (btrfs_test_opt(fs_info, NODATACOW)) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | - BTRFS_INODE_NODATASUM; + /* + * Subvolumes inherit properties from their parent subvolume, + * not the directory they were created in. + */ + parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID, + BTRFS_I(dir)->root); + if (IS_ERR(parent)) { + ret = PTR_ERR(parent); + } else { + ret = btrfs_inode_inherit_props(trans, inode, parent); + iput(parent); + } + } else { + ret = btrfs_inode_inherit_props(trans, inode, dir); + } + if (ret) { + btrfs_err(fs_info, + "error inheriting props for ino %llu (root %llu): %d", + btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, + ret); + } + + /* + * Subvolumes don't inherit ACLs or get passed to the LSM. This is + * probably a bug. + */ + if (!args->subvol) { + ret = btrfs_init_inode_security(trans, args); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto discard; + } } inode_tree_add(inode); @@ -6224,21 +6498,29 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_update_root_times(trans, root); - ret = btrfs_inode_inherit_props(trans, inode, dir); - if (ret) - btrfs_err(fs_info, - "error inheriting props for ino %llu (root %llu): %d", - btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret); + if (args->orphan) { + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + } else { + ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, + name_len, 0, BTRFS_I(inode)->dir_index); + } + if (ret) { + btrfs_abort_transaction(trans, ret); + goto discard; + } - return inode; + return 0; -fail_unlock: +discard: + /* + * discard_new_inode() calls iput(), but the caller owns the reference + * to the inode. + */ + ihold(inode); discard_new_inode(inode); -fail: - if (dir && name) - BTRFS_I(dir)->index_cnt--; +out: btrfs_free_path(path); - return ERR_PTR(ret); + return ret; } /* @@ -6330,147 +6612,71 @@ fail_dir_item: return ret; } -static int btrfs_add_nondir(struct btrfs_trans_handle *trans, - struct btrfs_inode *dir, struct dentry *dentry, - struct btrfs_inode *inode, int backref, u64 index) -{ - int err = btrfs_add_link(trans, dir, inode, - dentry->d_name.name, dentry->d_name.len, - backref, index); - if (err > 0) - err = -EEXIST; - return err; -} - -static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode, dev_t rdev) +static int btrfs_create_common(struct inode *dir, struct dentry *dentry, + struct inode *inode) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); - struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; - struct inode *inode = NULL; + struct btrfs_new_inode_args new_inode_args = { + .dir = dir, + .dentry = dentry, + .inode = inode, + }; + unsigned int trans_num_items; + struct btrfs_trans_handle *trans; int err; - u64 objectid; - u64 index = 0; - - /* - * 2 for inode item and ref - * 2 for dir items - * 1 for xattr if selinux is on - */ - trans = btrfs_start_transaction(root, 5); - if (IS_ERR(trans)) - return PTR_ERR(trans); - err = btrfs_get_free_objectid(root, &objectid); + err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); if (err) - goto out_unlock; + goto out_inode; - inode = btrfs_new_inode(trans, root, mnt_userns, dir, - dentry->d_name.name, dentry->d_name.len, - btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - inode = NULL; - goto out_unlock; + trans = btrfs_start_transaction(root, trans_num_items); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_new_inode_args; } - /* - * If the active LSM wants to access the inode during - * d_instantiate it needs these. Smack checks to see - * if the filesystem supports xattrs by looking at the - * ops vector. - */ - inode->i_op = &btrfs_special_inode_operations; - init_special_inode(inode, inode->i_mode, rdev); - - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) - goto out_unlock; - - err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), - 0, index); - if (err) - goto out_unlock; - - btrfs_update_inode(trans, root, BTRFS_I(inode)); - d_instantiate_new(dentry, inode); + err = btrfs_create_new_inode(trans, &new_inode_args); + if (!err) + d_instantiate_new(dentry, inode); -out_unlock: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); - if (err && inode) { - inode_dec_link_count(inode); - discard_new_inode(inode); - } +out_new_inode_args: + btrfs_new_inode_args_destroy(&new_inode_args); +out_inode: + if (err) + iput(inode); return err; } -static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode, bool excl) +static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(dir)->root; - struct inode *inode = NULL; - int err; - u64 objectid; - u64 index = 0; + struct inode *inode; - /* - * 2 for inode item and ref - * 2 for dir items - * 1 for xattr if selinux is on - */ - trans = btrfs_start_transaction(root, 5); - if (IS_ERR(trans)) - return PTR_ERR(trans); + inode = new_inode(dir->i_sb); + if (!inode) + return -ENOMEM; + inode_init_owner(mnt_userns, inode, dir, mode); + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + return btrfs_create_common(dir, dentry, inode); +} - err = btrfs_get_free_objectid(root, &objectid); - if (err) - goto out_unlock; +static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) +{ + struct inode *inode; - inode = btrfs_new_inode(trans, root, mnt_userns, dir, - dentry->d_name.name, dentry->d_name.len, - btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - inode = NULL; - goto out_unlock; - } - /* - * If the active LSM wants to access the inode during - * d_instantiate it needs these. Smack checks to see - * if the filesystem supports xattrs by looking at the - * ops vector. - */ + inode = new_inode(dir->i_sb); + if (!inode) + return -ENOMEM; + inode_init_owner(mnt_userns, inode, dir, mode); inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) - goto out_unlock; - - err = btrfs_update_inode(trans, root, BTRFS_I(inode)); - if (err) - goto out_unlock; - - err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), - 0, index); - if (err) - goto out_unlock; - - d_instantiate_new(dentry, inode); - -out_unlock: - btrfs_end_transaction(trans); - if (err && inode) { - inode_dec_link_count(inode); - discard_new_inode(inode); - } - btrfs_btree_balance_dirty(fs_info); - return err; + return btrfs_create_common(dir, dentry, inode); } static int btrfs_link(struct dentry *old_dentry, struct inode *dir, @@ -6516,8 +6722,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, ihold(inode); set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); - err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), - 1, index); + err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), + dentry->d_name.name, dentry->d_name.len, 1, index); if (err) { drop_inode = 1; @@ -6537,7 +6743,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, goto fail; } d_instantiate(dentry, inode); - btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent); + btrfs_log_new_name(trans, old_dentry, NULL, 0, parent); } fail: @@ -6554,66 +6760,15 @@ fail: static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); - struct inode *inode = NULL; - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(dir)->root; - int err = 0; - u64 objectid = 0; - u64 index = 0; - - /* - * 2 items for inode and ref - * 2 items for dir items - * 1 for xattr if selinux is on - */ - trans = btrfs_start_transaction(root, 5); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - err = btrfs_get_free_objectid(root, &objectid); - if (err) - goto out_fail; - - inode = btrfs_new_inode(trans, root, mnt_userns, dir, - dentry->d_name.name, dentry->d_name.len, - btrfs_ino(BTRFS_I(dir)), objectid, - S_IFDIR | mode, &index); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - inode = NULL; - goto out_fail; - } + struct inode *inode; - /* these must be set before we unlock the inode */ + inode = new_inode(dir->i_sb); + if (!inode) + return -ENOMEM; + inode_init_owner(mnt_userns, inode, dir, S_IFDIR | mode); inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; - - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) - goto out_fail; - - btrfs_i_size_write(BTRFS_I(inode), 0); - err = btrfs_update_inode(trans, root, BTRFS_I(inode)); - if (err) - goto out_fail; - - err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - dentry->d_name.name, - dentry->d_name.len, 0, index); - if (err) - goto out_fail; - - d_instantiate_new(dentry, inode); - -out_fail: - btrfs_end_transaction(trans); - if (err && inode) { - inode_dec_link_count(inode); - discard_new_inode(inode); - } - btrfs_btree_balance_dirty(fs_info); - return err; + return btrfs_create_common(dir, dentry, inode); } static noinline int uncompress_inline(struct btrfs_path *path, @@ -6694,7 +6849,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct btrfs_key found_key; struct extent_map *em = NULL; struct extent_map_tree *em_tree = &inode->extent_tree; - struct extent_io_tree *io_tree = &inode->io_tree; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); @@ -6857,8 +7011,6 @@ next: } flush_dcache_page(page); } - set_extent_uptodate(io_tree, em->start, - extent_map_end(em) - 1, NULL, GFP_NOFS); goto insert; } not_found: @@ -6892,133 +7044,6 @@ out: return em; } -struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, - u64 start, u64 len) -{ - struct extent_map *em; - struct extent_map *hole_em = NULL; - u64 delalloc_start = start; - u64 end; - u64 delalloc_len; - u64 delalloc_end; - int err = 0; - - em = btrfs_get_extent(inode, NULL, 0, start, len); - if (IS_ERR(em)) - return em; - /* - * If our em maps to: - * - a hole or - * - a pre-alloc extent, - * there might actually be delalloc bytes behind it. - */ - if (em->block_start != EXTENT_MAP_HOLE && - !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - return em; - else - hole_em = em; - - /* check to see if we've wrapped (len == -1 or similar) */ - end = start + len; - if (end < start) - end = (u64)-1; - else - end -= 1; - - em = NULL; - - /* ok, we didn't find anything, lets look for delalloc */ - delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start, - end, len, EXTENT_DELALLOC, 1); - delalloc_end = delalloc_start + delalloc_len; - if (delalloc_end < delalloc_start) - delalloc_end = (u64)-1; - - /* - * We didn't find anything useful, return the original results from - * get_extent() - */ - if (delalloc_start > end || delalloc_end <= start) { - em = hole_em; - hole_em = NULL; - goto out; - } - - /* - * Adjust the delalloc_start to make sure it doesn't go backwards from - * the start they passed in - */ - delalloc_start = max(start, delalloc_start); - delalloc_len = delalloc_end - delalloc_start; - - if (delalloc_len > 0) { - u64 hole_start; - u64 hole_len; - const u64 hole_end = extent_map_end(hole_em); - - em = alloc_extent_map(); - if (!em) { - err = -ENOMEM; - goto out; - } - - ASSERT(hole_em); - /* - * When btrfs_get_extent can't find anything it returns one - * huge hole - * - * Make sure what it found really fits our range, and adjust to - * make sure it is based on the start from the caller - */ - if (hole_end <= start || hole_em->start > end) { - free_extent_map(hole_em); - hole_em = NULL; - } else { - hole_start = max(hole_em->start, start); - hole_len = hole_end - hole_start; - } - - if (hole_em && delalloc_start > hole_start) { - /* - * Our hole starts before our delalloc, so we have to - * return just the parts of the hole that go until the - * delalloc starts - */ - em->len = min(hole_len, delalloc_start - hole_start); - em->start = hole_start; - em->orig_start = hole_start; - /* - * Don't adjust block start at all, it is fixed at - * EXTENT_MAP_HOLE - */ - em->block_start = hole_em->block_start; - em->block_len = hole_len; - if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags)) - set_bit(EXTENT_FLAG_PREALLOC, &em->flags); - } else { - /* - * Hole is out of passed range or it starts after - * delalloc range - */ - em->start = delalloc_start; - em->len = delalloc_len; - em->orig_start = delalloc_start; - em->block_start = EXTENT_MAP_DELALLOC; - em->block_len = delalloc_len; - } - } else { - return hole_em; - } -out: - - free_extent_map(hole_em); - if (err) { - free_extent_map(em); - return ERR_PTR(err); - } - return em; -} - static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, const u64 start, const u64 len, @@ -7040,12 +7065,16 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, if (IS_ERR(em)) goto out; } - ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len, - block_len, type); + ret = btrfs_add_ordered_extent(inode, start, len, len, block_start, + block_len, 0, + (1 << type) | + (1 << BTRFS_ORDERED_DIRECT), + BTRFS_COMPRESS_NONE); if (ret) { if (em) { free_extent_map(em); - btrfs_drop_extent_cache(inode, start, start + len - 1, 0); + btrfs_drop_extent_map_range(inode, start, + start + len - 1, false); } em = ERR_PTR(ret); } @@ -7116,9 +7145,10 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) */ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes, bool strict) + u64 *ram_bytes, bool nowait, bool strict) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct can_nocow_file_extent_args nocow_args = { 0 }; struct btrfs_path *path; int ret; struct extent_buffer *leaf; @@ -7126,35 +7156,29 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_file_extent_item *fi; struct btrfs_key key; - u64 disk_bytenr; - u64 backref_offset; - u64 extent_end; - u64 num_bytes; - int slot; int found_type; - bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW); path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->nowait = nowait; ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(BTRFS_I(inode)), offset, 0); if (ret < 0) goto out; - slot = path->slots[0]; if (ret == 1) { - if (slot == 0) { + if (path->slots[0] == 0) { /* can't find the item, must cow */ ret = 0; goto out; } - slot--; + path->slots[0]--; } ret = 0; leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, slot); + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != btrfs_ino(BTRFS_I(inode)) || key.type != BTRFS_EXTENT_DATA_KEY) { /* not our file or wrong item type, must cow */ @@ -7166,55 +7190,38 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, goto out; } - fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - found_type = btrfs_file_extent_type(leaf, fi); - if (found_type != BTRFS_FILE_EXTENT_REG && - found_type != BTRFS_FILE_EXTENT_PREALLOC) { - /* not a regular extent, must cow */ - goto out; - } - - if (!nocow && found_type == BTRFS_FILE_EXTENT_REG) + if (btrfs_file_extent_end(path) <= offset) goto out; - extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); - if (extent_end <= offset) - goto out; + fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(leaf, fi); + if (ram_bytes) + *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); - disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - if (disk_bytenr == 0) - goto out; + nocow_args.start = offset; + nocow_args.end = offset + *len - 1; + nocow_args.strict = strict; + nocow_args.free_path = true; - if (btrfs_file_extent_compression(leaf, fi) || - btrfs_file_extent_encryption(leaf, fi) || - btrfs_file_extent_other_encoding(leaf, fi)) - goto out; + ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args); + /* can_nocow_file_extent() has freed the path. */ + path = NULL; - /* - * Do the same check as in btrfs_cross_ref_exist but without the - * unnecessary search. - */ - if (!strict && - (btrfs_file_extent_generation(leaf, fi) <= - btrfs_root_last_snapshot(&root->root_item))) + if (ret != 1) { + /* Treat errors as not being able to NOCOW. */ + ret = 0; goto out; - - backref_offset = btrfs_file_extent_offset(leaf, fi); - - if (orig_start) { - *orig_start = key.offset - backref_offset; - *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); - *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); } - if (btrfs_extent_readonly(fs_info, disk_bytenr)) + ret = 0; + if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr)) goto out; - num_bytes = min(offset + *len, extent_end) - offset; - if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) { + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + found_type == BTRFS_FILE_EXTENT_PREALLOC) { u64 range_end; - range_end = round_up(offset + num_bytes, + range_end = round_up(offset + nocow_args.num_bytes, root->fs_info->sectorsize) - 1; ret = test_range_bit(io_tree, offset, range_end, EXTENT_DELALLOC, 0, NULL); @@ -7224,36 +7231,12 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, } } - btrfs_release_path(path); - - /* - * look for other files referencing this extent, if we - * find any we must cow - */ + if (orig_start) + *orig_start = key.offset - nocow_args.extent_offset; + if (orig_block_len) + *orig_block_len = nocow_args.disk_num_bytes; - ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)), - key.offset - backref_offset, disk_bytenr, - strict); - if (ret) { - ret = 0; - goto out; - } - - /* - * adjust disk_bytenr and num_bytes to cover just the bytes - * in this extent we are about to write. If there - * are any csums in that range we have to cow in order - * to keep the csums correct - */ - disk_bytenr += backref_offset; - disk_bytenr += offset - key.offset; - if (csum_exist_in_range(fs_info, disk_bytenr, num_bytes)) - goto out; - /* - * all of the above have passed, it is safe to overwrite this extent - * without cow - */ - *len = num_bytes; + *len = nocow_args.num_bytes; ret = 1; out: btrfs_free_path(path); @@ -7261,14 +7244,22 @@ out: } static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, - struct extent_state **cached_state, bool writing) + struct extent_state **cached_state, + unsigned int iomap_flags) { + const bool writing = (iomap_flags & IOMAP_WRITE); + const bool nowait = (iomap_flags & IOMAP_NOWAIT); + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; int ret = 0; while (1) { - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - cached_state); + if (nowait) { + if (!try_lock_extent(io_tree, lockstart, lockend)) + return -EAGAIN; + } else { + lock_extent(io_tree, lockstart, lockend, cached_state); + } /* * We're concerned with the entire range that we're going to be * doing DIO to, so we need to make sure there's no ordered @@ -7289,10 +7280,14 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, lockstart, lockend))) break; - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, - cached_state); + unlock_extent(io_tree, lockstart, lockend, cached_state); if (ordered) { + if (nowait) { + btrfs_put_ordered_extent(ordered); + ret = -EAGAIN; + break; + } /* * If we are doing a DIO read and the ordered extent we * found is for a buffered write, we can not wait for it @@ -7312,7 +7307,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) btrfs_start_ordered_extent(ordered, 1); else - ret = -ENOTBLK; + ret = nowait ? -EAGAIN : -ENOTBLK; btrfs_put_ordered_extent(ordered); } else { /* @@ -7328,7 +7323,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, * ordered extent to complete while holding a lock on * that page. */ - ret = -ENOTBLK; + ret = nowait ? -EAGAIN : -ENOTBLK; } if (ret) @@ -7347,7 +7342,6 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 ram_bytes, int compress_type, int type) { - struct extent_map_tree *em_tree; struct extent_map *em; int ret; @@ -7356,7 +7350,6 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_REGULAR); - em_tree = &inode->extent_tree; em = alloc_extent_map(); if (!em) return ERR_PTR(-ENOMEM); @@ -7377,18 +7370,7 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, em->compress_type = compress_type; } - do { - btrfs_drop_extent_cache(inode, em->start, - em->start + em->len - 1, 0); - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 1); - write_unlock(&em_tree->lock); - /* - * The caller has taken lock_extent(), who could race with us - * to add em? - */ - } while (ret == -EEXIST); - + ret = btrfs_replace_extent_map_range(inode, em, true); if (ret) { free_extent_map(em); return ERR_PTR(ret); @@ -7402,14 +7384,18 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, static int btrfs_get_blocks_direct_write(struct extent_map **map, struct inode *inode, struct btrfs_dio_data *dio_data, - u64 start, u64 len) + u64 start, u64 len, + unsigned int iomap_flags) { + const bool nowait = (iomap_flags & IOMAP_NOWAIT); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em = *map; int type; u64 block_start, orig_start, orig_block_len, ram_bytes; + struct btrfs_block_group *bg; bool can_nocow = false; bool space_reserved = false; + u64 prev_len; int ret = 0; /* @@ -7432,21 +7418,27 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, block_start = em->block_start + (start - em->start); if (can_nocow_extent(inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes, false) == 1 && - btrfs_inc_nocow_writers(fs_info, block_start)) - can_nocow = true; + &orig_block_len, &ram_bytes, false, false) == 1) { + bg = btrfs_inc_nocow_writers(fs_info, block_start); + if (bg) + can_nocow = true; + } } + prev_len = len; if (can_nocow) { struct extent_map *em2; /* We can NOCOW, so only need to reserve metadata space. */ - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, + nowait); if (ret < 0) { /* Our caller expects us to free the input extent map. */ free_extent_map(em); *map = NULL; - btrfs_dec_nocow_writers(fs_info, block_start); + btrfs_dec_nocow_writers(bg); + if (nowait && (ret == -ENOSPC || ret == -EDQUOT)) + ret = -EAGAIN; goto out; } space_reserved = true; @@ -7455,27 +7447,40 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, orig_start, block_start, len, orig_block_len, ram_bytes, type); - btrfs_dec_nocow_writers(fs_info, block_start); + btrfs_dec_nocow_writers(bg); if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); - *map = em = em2; + *map = em2; + em = em2; } if (IS_ERR(em2)) { ret = PTR_ERR(em2); goto out; } - } else { - const u64 prev_len = len; + dio_data->nocow_done = true; + } else { /* Our caller expects us to free the input extent map. */ free_extent_map(em); *map = NULL; - /* We have to COW, so need to reserve metadata and data space. */ - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), - &dio_data->data_reserved, - start, len); + if (nowait) + return -EAGAIN; + + /* + * If we could not allocate data space before locking the file + * range and we can't do a NOCOW write, then we have to fail. + */ + if (!dio_data->data_space_reserved) + return -ENOSPC; + + /* + * We have to COW and we have already reserved data space before, + * so now we reserve only metadata. + */ + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, + false); if (ret < 0) goto out; space_reserved = true; @@ -7488,17 +7493,15 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, *map = em; len = min(len, em->len - (start - em->start)); if (len < prev_len) - btrfs_delalloc_release_space(BTRFS_I(inode), - dio_data->data_reserved, - start + len, prev_len - len, - true); + btrfs_delalloc_release_metadata(BTRFS_I(inode), + prev_len - len, true); } /* * We have created our ordered extent, so we can now release our reservation * for an outstanding extent. */ - btrfs_delalloc_release_extents(BTRFS_I(inode), len); + btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len); /* * Need to update the i_size under the extent lock so buffered @@ -7509,15 +7512,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, out: if (ret && space_reserved) { btrfs_delalloc_release_extents(BTRFS_I(inode), len); - if (can_nocow) { - btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); - } else { - btrfs_delalloc_release_space(BTRFS_I(inode), - dio_data->data_reserved, - start, len, true); - extent_changeset_free(dio_data->data_reserved); - dio_data->data_reserved = NULL; - } + btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); } return ret; } @@ -7526,51 +7521,104 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { + struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em; struct extent_state *cached_state = NULL; - struct btrfs_dio_data *dio_data = NULL; + struct btrfs_dio_data *dio_data = iter->private; u64 lockstart, lockend; const bool write = !!(flags & IOMAP_WRITE); int ret = 0; u64 len = length; + const u64 data_alloc_len = length; bool unlock_extents = false; + /* + * We could potentially fault if we have a buffer > PAGE_SIZE, and if + * we're NOWAIT we may submit a bio for a partial range and return + * EIOCBQUEUED, which would result in an errant short read. + * + * The best way to handle this would be to allow for partial completions + * of iocb's, so we could submit the partial bio, return and fault in + * the rest of the pages, and then submit the io for the rest of the + * range. However we don't have that currently, so simply return + * -EAGAIN at this point so that the normal path is used. + */ + if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE) + return -EAGAIN; + + /* + * Cap the size of reads to that usually seen in buffered I/O as we need + * to allocate a contiguous array for the checksums. + */ if (!write) - len = min_t(u64, len, fs_info->sectorsize); + len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS); lockstart = start; lockend = start + len - 1; /* - * The generic stuff only does filemap_write_and_wait_range, which - * isn't enough if we've written compressed pages to this area, so we - * need to flush the dirty pages again to make absolutely sure that any - * outstanding dirty pages are on disk. + * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't + * enough if we've written compressed pages to this area, so we need to + * flush the dirty pages again to make absolutely sure that any + * outstanding dirty pages are on disk - the first flush only starts + * compression on the data, while keeping the pages locked, so by the + * time the second flush returns we know bios for the compressed pages + * were submitted and finished, and the pages no longer under writeback. + * + * If we have a NOWAIT request and we have any pages in the range that + * are locked, likely due to compression still in progress, we don't want + * to block on page locks. We also don't want to block on pages marked as + * dirty or under writeback (same as for the non-compression case). + * iomap_dio_rw() did the same check, but after that and before we got + * here, mmap'ed writes may have happened or buffered reads started + * (readpage() and readahead(), which lock pages), as we haven't locked + * the file range yet. */ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags)) { - ret = filemap_fdatawrite_range(inode->i_mapping, start, - start + length - 1); - if (ret) - return ret; + if (flags & IOMAP_NOWAIT) { + if (filemap_range_needs_writeback(inode->i_mapping, + lockstart, lockend)) + return -EAGAIN; + } else { + ret = filemap_fdatawrite_range(inode->i_mapping, start, + start + length - 1); + if (ret) + return ret; + } } - dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS); - if (!dio_data) - return -ENOMEM; - - iomap->private = dio_data; + memset(dio_data, 0, sizeof(*dio_data)); + /* + * We always try to allocate data space and must do it before locking + * the file range, to avoid deadlocks with concurrent writes to the same + * range if the range has several extents and the writes don't expand the + * current i_size (the inode lock is taken in shared mode). If we fail to + * allocate data space here we continue and later, after locking the + * file range, we fail with ENOSPC only if we figure out we can not do a + * NOCOW write. + */ + if (write && !(flags & IOMAP_NOWAIT)) { + ret = btrfs_check_data_free_space(BTRFS_I(inode), + &dio_data->data_reserved, + start, data_alloc_len, false); + if (!ret) + dio_data->data_space_reserved = true; + else if (ret && !(BTRFS_I(inode)->flags & + (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) + goto err; + } /* * If this errors out it's because we couldn't invalidate pagecache for - * this range and we need to fallback to buffered. + * this range and we need to fallback to buffered IO, or we are doing a + * NOWAIT read/write and we need to block. */ - if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) { - ret = -ENOTBLK; + ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags); + if (ret < 0) goto err; - } em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); if (IS_ERR(em)) { @@ -7595,19 +7643,77 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || em->block_start == EXTENT_MAP_INLINE) { free_extent_map(em); - ret = -ENOTBLK; + /* + * If we are in a NOWAIT context, return -EAGAIN in order to + * fallback to buffered IO. This is not only because we can + * block with buffered IO (no support for NOWAIT semantics at + * the moment) but also to avoid returning short reads to user + * space - this happens if we were able to read some data from + * previous non-compressed extents and then when we fallback to + * buffered IO, at btrfs_file_read_iter() by calling + * filemap_read(), we fail to fault in pages for the read buffer, + * in which case filemap_read() returns a short read (the number + * of bytes previously read is > 0, so it does not return -EFAULT). + */ + ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK; goto unlock_err; } len = min(len, em->len - (start - em->start)); + + /* + * If we have a NOWAIT request and the range contains multiple extents + * (or a mix of extents and holes), then we return -EAGAIN to make the + * caller fallback to a context where it can do a blocking (without + * NOWAIT) request. This way we avoid doing partial IO and returning + * success to the caller, which is not optimal for writes and for reads + * it can result in unexpected behaviour for an application. + * + * When doing a read, because we use IOMAP_DIO_PARTIAL when calling + * iomap_dio_rw(), we can end up returning less data then what the caller + * asked for, resulting in an unexpected, and incorrect, short read. + * That is, the caller asked to read N bytes and we return less than that, + * which is wrong unless we are crossing EOF. This happens if we get a + * page fault error when trying to fault in pages for the buffer that is + * associated to the struct iov_iter passed to iomap_dio_rw(), and we + * have previously submitted bios for other extents in the range, in + * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of + * those bios have completed by the time we get the page fault error, + * which we return back to our caller - we should only return EIOCBQUEUED + * after we have submitted bios for all the extents in the range. + */ + if ((flags & IOMAP_NOWAIT) && len < length) { + free_extent_map(em); + ret = -EAGAIN; + goto unlock_err; + } + if (write) { ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, - start, len); + start, len, flags); if (ret < 0) goto unlock_err; unlock_extents = true; /* Recalc len in case the new em is smaller than requested */ len = min(len, em->len - (start - em->start)); + if (dio_data->data_space_reserved) { + u64 release_offset; + u64 release_len = 0; + + if (dio_data->nocow_done) { + release_offset = start; + release_len = data_alloc_len; + } else if (len < data_alloc_len) { + release_offset = start + len; + release_len = data_alloc_len - len; + } + + if (release_len > 0) + btrfs_free_reserved_data_space(BTRFS_I(inode), + dio_data->data_reserved, + release_offset, + release_len); + } } else { /* * We need to unlock only the end area that we aren't using. @@ -7619,8 +7725,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, } if (unlock_extents) - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - lockstart, lockend, &cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); else free_extent_state(cached_state); @@ -7649,10 +7755,15 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, return 0; unlock_err: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); err: - kfree(dio_data); + if (dio_data->data_space_reserved) { + btrfs_free_reserved_data_space(BTRFS_I(inode), + dio_data->data_reserved, + start, data_alloc_len); + extent_changeset_free(dio_data->data_reserved); + } return ret; } @@ -7660,35 +7771,33 @@ err: static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, ssize_t written, unsigned int flags, struct iomap *iomap) { - int ret = 0; - struct btrfs_dio_data *dio_data = iomap->private; + struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); + struct btrfs_dio_data *dio_data = iter->private; size_t submitted = dio_data->submitted; const bool write = !!(flags & IOMAP_WRITE); + int ret = 0; if (!write && (iomap->type == IOMAP_HOLE)) { /* If reading from a hole, unlock and return */ - unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); - goto out; + unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1, + NULL); + return 0; } if (submitted < length) { pos += submitted; length -= submitted; if (write) - __endio_write_update_ordered(BTRFS_I(inode), pos, - length, false); + btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, + pos, length, false); else unlock_extent(&BTRFS_I(inode)->io_tree, pos, - pos + length - 1); + pos + length - 1, NULL); ret = -ENOTBLK; } if (write) extent_changeset_free(dio_data->data_reserved); -out: - kfree(dio_data); - iomap->private = NULL; - return ret; } @@ -7701,40 +7810,31 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) if (!refcount_dec_and_test(&dip->refs)) return; - if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) { - __endio_write_update_ordered(BTRFS_I(dip->inode), - dip->file_offset, - dip->bytes, - !dip->dio_bio->bi_status); + if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) { + btrfs_mark_ordered_io_finished(BTRFS_I(dip->inode), NULL, + dip->file_offset, dip->bytes, + !dip->bio.bi_status); } else { unlock_extent(&BTRFS_I(dip->inode)->io_tree, dip->file_offset, - dip->file_offset + dip->bytes - 1); + dip->file_offset + dip->bytes - 1, NULL); } - bio_endio(dip->dio_bio); - kfree(dip); + kfree(dip->csums); + bio_endio(&dip->bio); } -static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio, - int mirror_num, - unsigned long bio_flags) +static void submit_dio_repair_bio(struct inode *inode, struct bio *bio, + int mirror_num, + enum btrfs_compression_type compress_type) { - struct btrfs_dio_private *dip = bio->bi_private; + struct btrfs_dio_private *dip = btrfs_bio(bio)->private; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - blk_status_t ret; BUG_ON(bio_op(bio) == REQ_OP_WRITE); - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); - if (ret) - return ret; - refcount_inc(&dip->refs); - ret = btrfs_map_bio(fs_info, bio, mirror_num); - if (ret) - refcount_dec(&dip->refs); - return ret; + btrfs_submit_bio(fs_info, bio, mirror_num); } static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, @@ -7743,72 +7843,45 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, { struct inode *inode = dip->inode; struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - const u32 sectorsize = fs_info->sectorsize; - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); - struct bio_vec bvec; - struct bvec_iter iter; - const u64 orig_file_offset = dip->file_offset; - u64 start = orig_file_offset; - u32 bio_offset = 0; blk_status_t err = BLK_STS_OK; + struct bvec_iter iter; + struct bio_vec bv; + u32 offset; - __bio_for_each_segment(bvec, &bbio->bio, iter, bbio->iter) { - unsigned int i, nr_sectors, pgoff; + btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { + u64 start = bbio->file_offset + offset; - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); - pgoff = bvec.bv_offset; - for (i = 0; i < nr_sectors; i++) { - ASSERT(pgoff < PAGE_SIZE); - if (uptodate && - (!csum || !check_data_csum(inode, bbio, - bio_offset, bvec.bv_page, - pgoff, start))) { - clean_io_failure(fs_info, failure_tree, io_tree, - start, bvec.bv_page, - btrfs_ino(BTRFS_I(inode)), - pgoff); - } else { - int ret; - - ASSERT((start - orig_file_offset) < UINT_MAX); - ret = btrfs_repair_one_sector(inode, - &bbio->bio, - start - orig_file_offset, - bvec.bv_page, pgoff, - start, bbio->mirror_num, - submit_dio_repair_bio); - if (ret) - err = errno_to_blk_status(ret); - } - start += sectorsize; - ASSERT(bio_offset + sectorsize > bio_offset); - bio_offset += sectorsize; - pgoff += sectorsize; + if (uptodate && + (!csum || !btrfs_check_data_csum(inode, bbio, offset, bv.bv_page, + bv.bv_offset))) { + btrfs_clean_io_failure(BTRFS_I(inode), start, + bv.bv_page, bv.bv_offset); + } else { + int ret; + + ret = btrfs_repair_one_sector(inode, bbio, offset, + bv.bv_page, bv.bv_offset, + submit_dio_repair_bio); + if (ret) + err = errno_to_blk_status(ret); } } - return err; -} -static void __endio_write_update_ordered(struct btrfs_inode *inode, - const u64 offset, const u64 bytes, - const bool uptodate) -{ - btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, - finish_ordered_fn, uptodate); + return err; } static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, struct bio *bio, u64 dio_file_offset) { - return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, 1); + return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false); } -static void btrfs_end_dio_bio(struct bio *bio) +static void btrfs_end_dio_bio(struct btrfs_bio *bbio) { - struct btrfs_dio_private *dip = bio->bi_private; + struct btrfs_dio_private *dip = bbio->private; + struct bio *bio = &bbio->bio; blk_status_t err = bio->bi_status; if (err) @@ -7819,108 +7892,65 @@ static void btrfs_end_dio_bio(struct bio *bio) bio->bi_iter.bi_size, err); if (bio_op(bio) == REQ_OP_READ) - err = btrfs_check_read_dio_bio(dip, btrfs_bio(bio), !err); + err = btrfs_check_read_dio_bio(dip, bbio, !err); if (err) - dip->dio_bio->bi_status = err; + dip->bio.bi_status = err; - btrfs_record_physical_zoned(dip->inode, dip->file_offset, bio); + btrfs_record_physical_zoned(dip->inode, bbio->file_offset, bio); bio_put(bio); btrfs_dio_private_put(dip); } -static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, - struct inode *inode, u64 file_offset, int async_submit) +static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, + u64 file_offset, int async_submit) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_dio_private *dip = bio->bi_private; - bool write = btrfs_op(bio) == BTRFS_MAP_WRITE; + struct btrfs_dio_private *dip = btrfs_bio(bio)->private; blk_status_t ret; - /* Check btrfs_submit_bio_hook() for rules about async submit. */ - if (async_submit) - async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); - - if (!write) { - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); - if (ret) - goto err; - } + /* Save the original iter for read repair */ + if (btrfs_op(bio) == BTRFS_MAP_READ) + btrfs_bio(bio)->iter = bio->bi_iter; if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) goto map; - if (write && async_submit) { - ret = btrfs_wq_submit_bio(inode, bio, 0, 0, file_offset, - btrfs_submit_bio_start_direct_io); - goto err; - } else if (write) { + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { + /* Check btrfs_submit_data_write_bio() for async submit rules */ + if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers) && + btrfs_wq_submit_bio(inode, bio, 0, file_offset, + btrfs_submit_bio_start_direct_io)) + return; + /* * If we aren't doing async submit, calculate the csum of the * bio now. */ - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, 1); - if (ret) - goto err; + ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false); + if (ret) { + btrfs_bio_end_io(btrfs_bio(bio), ret); + return; + } } else { - u64 csum_offset; - - csum_offset = file_offset - dip->file_offset; - csum_offset >>= fs_info->sectorsize_bits; - csum_offset *= fs_info->csum_size; - btrfs_bio(bio)->csum = dip->csums + csum_offset; + btrfs_bio(bio)->csum = btrfs_csum_ptr(fs_info, dip->csums, + file_offset - dip->file_offset); } map: - ret = btrfs_map_bio(fs_info, bio, 0); -err: - return ret; -} - -/* - * If this succeeds, the btrfs_dio_private is responsible for cleaning up locked - * or ordered extents whether or not we submit any bios. - */ -static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, - struct inode *inode, - loff_t file_offset) -{ - const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); - const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); - size_t dip_size; - struct btrfs_dio_private *dip; - - dip_size = sizeof(*dip); - if (!write && csum) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - size_t nblocks; - - nblocks = dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits; - dip_size += fs_info->csum_size * nblocks; - } - - dip = kzalloc(dip_size, GFP_NOFS); - if (!dip) - return NULL; - - dip->inode = inode; - dip->file_offset = file_offset; - dip->bytes = dio_bio->bi_iter.bi_size; - dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9; - dip->dio_bio = dio_bio; - refcount_set(&dip->refs, 1); - return dip; + btrfs_submit_bio(fs_info, bio, 0); } static void btrfs_submit_direct(const struct iomap_iter *iter, struct bio *dio_bio, loff_t file_offset) { + struct btrfs_dio_private *dip = + container_of(dio_bio, struct btrfs_dio_private, bio); struct inode *inode = iter->inode; const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const bool raid56 = (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK); - struct btrfs_dio_private *dip; struct bio *bio; u64 start_sector; int async_submit = 0; @@ -7931,27 +7961,28 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, int ret; blk_status_t status; struct btrfs_io_geometry geom; - struct btrfs_dio_data *dio_data = iter->iomap.private; + struct btrfs_dio_data *dio_data = iter->private; struct extent_map *em = NULL; - dip = btrfs_create_dio_private(dio_bio, inode, file_offset); - if (!dip) { - if (!write) { - unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, - file_offset + dio_bio->bi_iter.bi_size - 1); - } - dio_bio->bi_status = BLK_STS_RESOURCE; - bio_endio(dio_bio); - return; - } + dip->inode = inode; + dip->file_offset = file_offset; + dip->bytes = dio_bio->bi_iter.bi_size; + refcount_set(&dip->refs, 1); + dip->csums = NULL; + + if (!write && !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + unsigned int nr_sectors = + (dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits); - if (!write) { /* * Load the csums up front to reduce csum tree searches and * contention when submitting bios. - * - * If we have csums disabled this will do nothing. */ + status = BLK_STS_RESOURCE; + dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS); + if (!dip->csums) + goto out_err; + status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums); if (status != BLK_STS_OK) goto out_err; @@ -7982,9 +8013,9 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, * This will never fail as it's passing GPF_NOFS and * the allocation is backed by btrfs_bioset. */ - bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len); - bio->bi_private = dip; - bio->bi_end_io = btrfs_end_dio_bio; + bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len, + btrfs_end_dio_bio, dip); + btrfs_bio(bio)->file_offset = file_offset; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { status = extract_ordered_extent(BTRFS_I(inode), bio, @@ -8019,14 +8050,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, async_submit = 1; } - status = btrfs_submit_dio_bio(bio, inode, file_offset, - async_submit); - if (status) { - bio_put(bio); - if (submit_len > 0) - refcount_dec(&dip->refs); - goto out_err_em; - } + btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); dio_data->submitted += clone_len; clone_offset += clone_len; @@ -8040,70 +8064,66 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, out_err_em: free_extent_map(em); out_err: - dip->dio_bio->bi_status = status; + dio_bio->bi_status = status; btrfs_dio_private_put(dip); } -const struct iomap_ops btrfs_dio_iomap_ops = { +static const struct iomap_ops btrfs_dio_iomap_ops = { .iomap_begin = btrfs_dio_iomap_begin, .iomap_end = btrfs_dio_iomap_end, }; -const struct iomap_dio_ops btrfs_dio_ops = { +static const struct iomap_dio_ops btrfs_dio_ops = { .submit_io = btrfs_submit_direct, + .bio_set = &btrfs_dio_bioset, }; -static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - u64 start, u64 len) +ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) { - int ret; + struct btrfs_dio_data data; - ret = fiemap_prep(inode, fieinfo, start, &len, 0); - if (ret) - return ret; - - return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); + return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + IOMAP_DIO_PARTIAL, &data, done_before); } -int btrfs_readpage(struct file *file, struct page *page) +struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before) { - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; - struct btrfs_bio_ctrl bio_ctrl = { 0 }; - int ret; - - btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); + struct btrfs_dio_data data; - ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); - if (bio_ctrl.bio) - ret = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags); - return ret; + return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + IOMAP_DIO_PARTIAL, &data, done_before); } -static int btrfs_writepage(struct page *page, struct writeback_control *wbc) +static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) { - struct inode *inode = page->mapping->host; - int ret; + int ret; - if (current->flags & PF_MEMALLOC) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } + ret = fiemap_prep(inode, fieinfo, start, &len, 0); + if (ret) + return ret; /* - * If we are under memory pressure we will call this directly from the - * VM, we need to make sure we have the inode referenced for the ordered - * extent. If not just return like we didn't do anything. + * fiemap_prep() called filemap_write_and_wait() for the whole possible + * file range (0 to LLONG_MAX), but that is not enough if we have + * compression enabled. The first filemap_fdatawrite_range() only kicks + * in the compression of data (in an async thread) and will return + * before the compression is done and writeback is started. A second + * filemap_fdatawrite_range() is needed to wait for the compression to + * complete and writeback to start. We also need to wait for ordered + * extents to complete, because our fiemap implementation uses mainly + * file extent items to list the extents, searching for extent maps + * only for file ranges with holes or prealloc extents to figure out + * if we have delalloc in those ranges. */ - if (!igrab(inode)) { - redirty_page_for_writepage(wbc, page); - return AOP_WRITEPAGE_ACTIVATE; + if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) { + ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX); + if (ret) + return ret; } - ret = extent_write_full_page(page, wbc); - btrfs_add_delayed_iput(inode); - return ret; + + return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); } static int btrfs_writepages(struct address_space *mapping, @@ -8118,8 +8138,8 @@ static void btrfs_readahead(struct readahead_control *rac) } /* - * For releasepage() and invalidatepage() we have a race window where - * end_page_writeback() is called but the subpage spinlock is not yet released. + * For release_folio() and invalidate_folio() we have a race window where + * folio_end_writeback() is called but the subpage spinlock is not yet released. * If we continue to release/invalidate the page, we could cause use-after-free * for subpage spinlock. So this function is to spin and wait for subpage * spinlock. @@ -8129,7 +8149,7 @@ static void wait_subpage_spinlock(struct page *page) struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); struct btrfs_subpage *subpage; - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) return; ASSERT(PagePrivate(page) && page->private); @@ -8150,105 +8170,99 @@ static void wait_subpage_spinlock(struct page *page) spin_unlock_irq(&subpage->lock); } -static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) +static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) { - int ret = try_release_extent_mapping(page, gfp_flags); + int ret = try_release_extent_mapping(&folio->page, gfp_flags); if (ret == 1) { - wait_subpage_spinlock(page); - clear_page_extent_mapped(page); + wait_subpage_spinlock(&folio->page); + clear_page_extent_mapped(&folio->page); } return ret; } -static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) +static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) { - if (PageWriteback(page) || PageDirty(page)) - return 0; - return __btrfs_releasepage(page, gfp_flags); + if (folio_test_writeback(folio) || folio_test_dirty(folio)) + return false; + return __btrfs_release_folio(folio, gfp_flags); } #ifdef CONFIG_MIGRATION -static int btrfs_migratepage(struct address_space *mapping, - struct page *newpage, struct page *page, +static int btrfs_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) { - int ret; + int ret = filemap_migrate_folio(mapping, dst, src, mode); - ret = migrate_page_move_mapping(mapping, newpage, page, 0); if (ret != MIGRATEPAGE_SUCCESS) return ret; - if (page_has_private(page)) - attach_page_private(newpage, detach_page_private(page)); - - if (PageOrdered(page)) { - ClearPageOrdered(page); - SetPageOrdered(newpage); + if (folio_test_ordered(src)) { + folio_clear_ordered(src); + folio_set_ordered(dst); } - if (mode != MIGRATE_SYNC_NO_COPY) - migrate_page_copy(newpage, page); - else - migrate_page_states(newpage, page); return MIGRATEPAGE_SUCCESS; } +#else +#define btrfs_migrate_folio NULL #endif -static void btrfs_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) +static void btrfs_invalidate_folio(struct folio *folio, size_t offset, + size_t length) { - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_inode *inode = BTRFS_I(folio->mapping->host); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *tree = &inode->io_tree; struct extent_state *cached_state = NULL; - u64 page_start = page_offset(page); - u64 page_end = page_start + PAGE_SIZE - 1; + u64 page_start = folio_pos(folio); + u64 page_end = page_start + folio_size(folio) - 1; u64 cur; int inode_evicting = inode->vfs_inode.i_state & I_FREEING; /* - * We have page locked so no new ordered extent can be created on this - * page, nor bio can be submitted for this page. + * We have folio locked so no new ordered extent can be created on this + * page, nor bio can be submitted for this folio. * - * But already submitted bio can still be finished on this page. - * Furthermore, endio function won't skip page which has Ordered + * But already submitted bio can still be finished on this folio. + * Furthermore, endio function won't skip folio which has Ordered * (Private2) already cleared, so it's possible for endio and - * invalidatepage to do the same ordered extent accounting twice - * on one page. + * invalidate_folio to do the same ordered extent accounting twice + * on one folio. * * So here we wait for any submitted bios to finish, so that we won't - * do double ordered extent accounting on the same page. + * do double ordered extent accounting on the same folio. */ - wait_on_page_writeback(page); - wait_subpage_spinlock(page); + folio_wait_writeback(folio); + wait_subpage_spinlock(&folio->page); /* * For subpage case, we have call sites like * btrfs_punch_hole_lock_range() which passes range not aligned to * sectorsize. - * If the range doesn't cover the full page, we don't need to and - * shouldn't clear page extent mapped, as page->private can still + * If the range doesn't cover the full folio, we don't need to and + * shouldn't clear page extent mapped, as folio->private can still * record subpage dirty bits for other part of the range. * - * For cases that can invalidate the full even the range doesn't - * cover the full page, like invalidating the last page, we're + * For cases that invalidate the full folio even the range doesn't + * cover the full folio, like invalidating the last folio, we're * still safe to wait for ordered extent to finish. */ - if (!(offset == 0 && length == PAGE_SIZE)) { - btrfs_releasepage(page, GFP_NOFS); + if (!(offset == 0 && length == folio_size(folio))) { + btrfs_release_folio(folio, GFP_NOFS); return; } if (!inode_evicting) - lock_extent_bits(tree, page_start, page_end, &cached_state); + lock_extent(tree, page_start, page_end, &cached_state); cur = page_start; while (cur < page_end) { struct btrfs_ordered_extent *ordered; - bool delete_states; u64 range_end; u32 range_len; + u32 extra_flags = 0; ordered = btrfs_lookup_first_ordered_range(inode, cur, page_end + 1 - cur); @@ -8258,7 +8272,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, * No ordered extent covering this range, we are safe * to delete all extent states in the range. */ - delete_states = true; + extra_flags = EXTENT_CLEAR_ALL_BITS; goto next; } if (ordered->file_offset > cur) { @@ -8269,7 +8283,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, * the ordered extent in the next iteration. */ range_end = ordered->file_offset - 1; - delete_states = true; + extra_flags = EXTENT_CLEAR_ALL_BITS; goto next; } @@ -8277,17 +8291,16 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, page_end); ASSERT(range_end + 1 - cur < U32_MAX); range_len = range_end + 1 - cur; - if (!btrfs_page_test_ordered(fs_info, page, cur, range_len)) { + if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) { /* * If Ordered (Private2) is cleared, it means endio has * already been executed for the range. * We can't delete the extent states as * btrfs_finish_ordered_io() may still use some of them. */ - delete_states = false; goto next; } - btrfs_page_clear_ordered(fs_info, page, cur, range_len); + btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len); /* * IO on this page will never be started, so we need to account @@ -8301,7 +8314,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, clear_extent_bit(tree, cur, range_end, EXTENT_DELALLOC | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 0, &cached_state); + EXTENT_DEFRAG, &cached_state); spin_lock_irq(&inode->ordered_tree.lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); @@ -8309,6 +8322,12 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, cur - ordered->file_offset); spin_unlock_irq(&inode->ordered_tree.lock); + /* + * If the ordered extent has finished, we're safe to delete all + * the extent states of the range, otherwise + * btrfs_finish_ordered_io() will get executed by endio for + * other pages, so we can't delete extent states. + */ if (btrfs_dec_test_ordered_pending(inode, &ordered, cur, range_end + 1 - cur)) { btrfs_finish_ordered_io(ordered); @@ -8316,14 +8335,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, * The ordered extent has finished, now we're again * safe to delete all extent states of the range. */ - delete_states = true; - } else { - /* - * btrfs_finish_ordered_io() will get executed by endio - * of other pages, thus we can't delete extent states - * anymore - */ - delete_states = false; + extra_flags = EXTENT_CLEAR_ALL_BITS; } next: if (ordered) @@ -8347,8 +8359,8 @@ next: if (!inode_evicting) { clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_UPTODATE | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, - delete_states, &cached_state); + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG | + extra_flags, &cached_state); } cur = range_end + 1; } @@ -8357,11 +8369,11 @@ next: * should not have Ordered (Private2) anymore, or the above iteration * did something wrong. */ - ASSERT(!PageOrdered(page)); - btrfs_page_clear_checked(fs_info, page, page_offset(page), PAGE_SIZE); + ASSERT(!folio_test_ordered(folio)); + btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio)); if (!inode_evicting) - __btrfs_releasepage(page, GFP_NOFS); - clear_page_extent_mapped(page); + __btrfs_release_folio(folio, GFP_NOFS); + clear_page_extent_mapped(&folio->page); } /* @@ -8409,7 +8421,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) * Reserving delalloc space after obtaining the page lock can lead to * deadlock. For example, if a dirty page is locked by this function * and the call to btrfs_delalloc_reserve_space() ends up triggering - * dirty page write out, then the btrfs_writepage() function could + * dirty page write out, then the btrfs_writepages() function could * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. */ @@ -8439,11 +8451,11 @@ again: } wait_on_page_writeback(page); - lock_extent_bits(io_tree, page_start, page_end, &cached_state); + lock_extent(io_tree, page_start, page_end, &cached_state); ret2 = set_page_extent_mapped(page); if (ret2 < 0) { ret = vmf_error(ret2); - unlock_extent_cached(io_tree, page_start, page_end, &cached_state); + unlock_extent(io_tree, page_start, page_end, &cached_state); goto out_unlock; } @@ -8454,8 +8466,7 @@ again: ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE); if (ordered) { - unlock_extent_cached(io_tree, page_start, page_end, - &cached_state); + unlock_extent(io_tree, page_start, page_end, &cached_state); unlock_page(page); up_read(&BTRFS_I(inode)->i_mmap_lock); btrfs_start_ordered_extent(ordered, 1); @@ -8483,13 +8494,12 @@ again: */ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 0, 0, &cached_state); + EXTENT_DEFRAG, &cached_state); ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, &cached_state); if (ret2) { - unlock_extent_cached(io_tree, page_start, page_end, - &cached_state); + unlock_extent(io_tree, page_start, page_end, &cached_state); ret = VM_FAULT_SIGBUS; goto out_unlock; } @@ -8500,17 +8510,16 @@ again: else zero_start = PAGE_SIZE; - if (zero_start != PAGE_SIZE) { + if (zero_start != PAGE_SIZE) memzero_page(page, zero_start, PAGE_SIZE - zero_start); - flush_dcache_page(page); - } + btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start); btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start); btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); - unlock_extent_cached(io_tree, page_start, page_end, &cached_state); + unlock_extent(io_tree, page_start, page_end, &cached_state); up_read(&BTRFS_I(inode)->i_mmap_lock); btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); @@ -8586,7 +8595,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) if (!rsv) return -ENOMEM; rsv->size = min_size; - rsv->failfast = 1; + rsv->failfast = true; /* * 1 for the truncate slack space @@ -8611,24 +8620,24 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); control.new_size = new_size; - lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, + lock_extent(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, &cached_state); /* * We want to drop from the next block forward in case this new * size is not block aligned since we will be keeping the last * block of the extent just the way it is. */ - btrfs_drop_extent_cache(BTRFS_I(inode), - ALIGN(new_size, fs_info->sectorsize), - (u64)-1, 0); + btrfs_drop_extent_map_range(BTRFS_I(inode), + ALIGN(new_size, fs_info->sectorsize), + (u64)-1, false); ret = btrfs_truncate_inode_items(trans, root, &control); inode_sub_bytes(inode, control.sub_bytes); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), control.last_size); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, - (u64)-1, &cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, + &cached_state); trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) @@ -8706,51 +8715,28 @@ out: * extents beyond i_size to drop. */ if (control.extents_found > 0) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + btrfs_set_inode_full_sync(BTRFS_I(inode)); return ret; } -/* - * create a new subvolume directory/inode (helper for the ioctl). - */ -int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, - struct btrfs_root *parent_root, - struct user_namespace *mnt_userns) +struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, + struct inode *dir) { struct inode *inode; - int err; - u64 index = 0; - u64 ino; - - err = btrfs_get_free_objectid(new_root, &ino); - if (err < 0) - return err; - inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2, - ino, ino, - S_IFDIR | (~current_umask() & S_IRWXUGO), - &index); - if (IS_ERR(inode)) - return PTR_ERR(inode); - inode->i_op = &btrfs_dir_inode_operations; - inode->i_fop = &btrfs_dir_file_operations; - - set_nlink(inode, 1); - btrfs_i_size_write(BTRFS_I(inode), 0); - unlock_new_inode(inode); - - err = btrfs_subvol_inherit_props(trans, new_root, parent_root); - if (err) - btrfs_err(new_root->fs_info, - "error inheriting subvolume %llu properties: %d", - new_root->root_key.objectid, err); - - err = btrfs_update_inode(trans, new_root, BTRFS_I(inode)); - - iput(inode); - return err; + inode = new_inode(dir->i_sb); + if (inode) { + /* + * Subvolumes don't inherit the sgid bit or the parent's gid if + * the parent's sgid bit is set. This is probably a bug. + */ + inode_init_owner(mnt_userns, inode, NULL, + S_IFDIR | (~current_umask() & S_IRWXUGO)); + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; + } + return inode; } struct inode *btrfs_alloc_inode(struct super_block *sb) @@ -8759,7 +8745,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) struct btrfs_inode *ei; struct inode *inode; - ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; @@ -8782,6 +8768,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_log_commit = 0; spin_lock_init(&ei->lock); + spin_lock_init(&ei->io_failure_lock); ei->outstanding_extents = 0; if (sb->s_magic != BTRFS_TEST_MAGIC) btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, @@ -8798,12 +8785,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode); - extent_io_tree_init(fs_info, &ei->io_failure_tree, - IO_TREE_INODE_IO_FAILURE, inode); extent_io_tree_init(fs_info, &ei->file_extent_tree, - IO_TREE_INODE_FILE_EXTENT, inode); - ei->io_tree.track_uptodate = true; - ei->io_failure_tree.track_uptodate = true; + IO_TREE_INODE_FILE_EXTENT, NULL); + ei->io_failure_tree = RB_ROOT; atomic_set(&ei->sync_writers, 0); mutex_init(&ei->log_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); @@ -8818,7 +8802,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_destroy_inode(struct inode *inode) { - btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); + btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } #endif @@ -8833,6 +8817,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode) struct btrfs_ordered_extent *ordered; struct btrfs_inode *inode = BTRFS_I(vfs_inode); struct btrfs_root *root = inode->root; + bool freespace_inode; WARN_ON(!hlist_empty(&vfs_inode->i_dentry)); WARN_ON(vfs_inode->i_data.nrpages); @@ -8854,6 +8839,12 @@ void btrfs_destroy_inode(struct inode *vfs_inode) if (!root) return; + /* + * If this is a free space inode do not take the ordered extents lockdep + * map. + */ + freespace_inode = btrfs_is_free_space_inode(inode); + while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); if (!ordered) @@ -8862,6 +8853,10 @@ void btrfs_destroy_inode(struct inode *vfs_inode) btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup", ordered->file_offset, ordered->num_bytes); + + if (!freespace_inode) + btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent); + btrfs_remove_ordered_extent(inode, ordered); btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); @@ -8869,7 +8864,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode) } btrfs_qgroup_check_reserved_leak(inode); inode_tree_del(inode); - btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1); btrfs_put_root(inode->root); } @@ -8890,7 +8885,7 @@ int btrfs_drop_inode(struct inode *inode) static void init_once(void *foo) { - struct btrfs_inode *ei = (struct btrfs_inode *) foo; + struct btrfs_inode *ei = foo; inode_init_once(&ei->vfs_inode); } @@ -8902,6 +8897,7 @@ void __cold btrfs_destroy_cachep(void) * destroy cache. */ rcu_barrier(); + bioset_exit(&btrfs_dio_bioset); kmem_cache_destroy(btrfs_inode_cachep); kmem_cache_destroy(btrfs_trans_handle_cachep); kmem_cache_destroy(btrfs_path_cachep); @@ -8942,6 +8938,11 @@ int __init btrfs_init_cachep(void) if (!btrfs_free_space_bitmap_cachep) goto fail; + if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_dio_private, bio), + BIOSET_NEED_BVECS)) + goto fail; + return 0; fail: btrfs_destroy_cachep(); @@ -8997,19 +8998,20 @@ static int btrfs_rename_exchange(struct inode *old_dir, { struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); struct btrfs_trans_handle *trans; + unsigned int trans_num_items; struct btrfs_root *root = BTRFS_I(old_dir)->root; struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = new_dentry->d_inode; struct inode *old_inode = old_dentry->d_inode; struct timespec64 ctime = current_time(old_inode); + struct btrfs_rename_ctx old_rename_ctx; + struct btrfs_rename_ctx new_rename_ctx; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); u64 old_idx = 0; u64 new_idx = 0; int ret; int ret2; - bool root_log_pinned = false; - bool dest_log_pinned = false; bool need_abort = false; /* @@ -9028,14 +9030,37 @@ static int btrfs_rename_exchange(struct inode *old_dir, down_read(&fs_info->subvol_sem); /* - * We want to reserve the absolute worst case amount of items. So if - * both inodes are subvols and we need to unlink them then that would - * require 4 item modifications, but if they are both normal inodes it - * would require 5 item modifications, so we'll assume their normal - * inodes. So 5 * 2 is 10, plus 2 for the new links, so 12 total items - * should cover the worst case number of items we'll modify. + * For each inode: + * 1 to remove old dir item + * 1 to remove old dir index + * 1 to add new dir item + * 1 to add new dir index + * 1 to update parent inode + * + * If the parents are the same, we only need to account for one */ - trans = btrfs_start_transaction(root, 12); + trans_num_items = (old_dir == new_dir ? 9 : 10); + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { + /* + * 1 to remove old root ref + * 1 to remove old root backref + * 1 to add new root ref + * 1 to add new root backref + */ + trans_num_items += 4; + } else { + /* + * 1 to update inode item + * 1 to remove old inode ref + * 1 to add new inode ref + */ + trans_num_items += 3; + } + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) + trans_num_items += 4; + else + trans_num_items += 3; + trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_notrans; @@ -9100,8 +9125,10 @@ static int btrfs_rename_exchange(struct inode *old_dir, inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); inode_inc_iversion(new_inode); - old_dir->i_ctime = old_dir->i_mtime = ctime; - new_dir->i_ctime = new_dir->i_mtime = ctime; + old_dir->i_mtime = ctime; + old_dir->i_ctime = ctime; + new_dir->i_mtime = ctime; + new_dir->i_ctime = ctime; old_inode->i_ctime = ctime; new_inode->i_ctime = ctime; @@ -9112,29 +9139,6 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_inode), 1); } - /* - * Now pin the logs of the roots. We do it to ensure that no other task - * can sync the logs while we are in progress with the rename, because - * that could result in an inconsistency in case any of the inodes that - * are part of this rename operation were logged before. - * - * We pin the logs even if at this precise moment none of the inodes was - * logged before. This is because right after we checked for that, some - * other task fsyncing some other inode not involved with this rename - * operation could log that one of our inodes exists. - * - * We don't need to pin the logs before the above calls to - * btrfs_insert_inode_ref(), since those don't ever need to change a log. - */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { - btrfs_pin_log_trans(root); - root_log_pinned = true; - } - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) { - btrfs_pin_log_trans(dest); - dest_log_pinned = true; - } - /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); @@ -9142,7 +9146,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), old_dentry->d_name.name, - old_dentry->d_name.len); + old_dentry->d_name.len, + &old_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } @@ -9158,7 +9163,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), new_dentry->d_name.name, - new_dentry->d_name.len); + new_dentry->d_name.len, + &new_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); } @@ -9188,46 +9194,31 @@ static int btrfs_rename_exchange(struct inode *old_dir, if (new_inode->i_nlink == 1) BTRFS_I(new_inode)->dir_index = new_idx; - if (root_log_pinned) { - btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), - new_dentry->d_parent); - btrfs_end_log_trans(root); - root_log_pinned = false; - } - if (dest_log_pinned) { - btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir), - old_dentry->d_parent); - btrfs_end_log_trans(dest); - dest_log_pinned = false; - } -out_fail: /* - * If we have pinned a log and an error happened, we unpin tasks - * trying to sync the log and force them to fallback to a transaction - * commit if the log currently contains any of the inodes involved in - * this rename operation (to ensure we do not persist a log with an - * inconsistent state for any of these inodes or leading to any - * inconsistencies when replayed). If the transaction was aborted, the - * abortion reason is propagated to userspace when attempting to commit - * the transaction. If the log does not contain any of these inodes, we - * allow the tasks to sync it. + * Now pin the logs of the roots. We do it to ensure that no other task + * can sync the logs while we are in progress with the rename, because + * that could result in an inconsistency in case any of the inodes that + * are part of this rename operation were logged before. */ - if (ret && (root_log_pinned || dest_log_pinned)) { - if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) || - btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) || - btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || - btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)) - btrfs_set_log_full_commit(trans); + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_pin_log_trans(root); + if (new_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_pin_log_trans(dest); - if (root_log_pinned) { - btrfs_end_log_trans(root); - root_log_pinned = false; - } - if (dest_log_pinned) { - btrfs_end_log_trans(dest); - dest_log_pinned = false; - } - } + /* Do the log updates for all inodes. */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), + old_rename_ctx.index, new_dentry->d_parent); + if (new_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir), + new_rename_ctx.index, old_dentry->d_parent); + + /* Now unpin the logs. */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_end_log_trans(root); + if (new_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_end_log_trans(dest); +out_fail: ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: @@ -9238,56 +9229,19 @@ out_notrans: return ret; } -static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct user_namespace *mnt_userns, - struct inode *dir, - struct dentry *dentry) +static struct inode *new_whiteout_inode(struct user_namespace *mnt_userns, + struct inode *dir) { - int ret; struct inode *inode; - u64 objectid; - u64 index; - ret = btrfs_get_free_objectid(root, &objectid); - if (ret) - return ret; - - inode = btrfs_new_inode(trans, root, mnt_userns, dir, - dentry->d_name.name, - dentry->d_name.len, - btrfs_ino(BTRFS_I(dir)), - objectid, - S_IFCHR | WHITEOUT_MODE, - &index); - - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - return ret; + inode = new_inode(dir->i_sb); + if (inode) { + inode_init_owner(mnt_userns, inode, dir, + S_IFCHR | WHITEOUT_MODE); + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); } - - inode->i_op = &btrfs_special_inode_operations; - init_special_inode(inode, inode->i_mode, - WHITEOUT_DEV); - - ret = btrfs_init_inode_security(trans, inode, dir, - &dentry->d_name); - if (ret) - goto out; - - ret = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, - BTRFS_I(inode), 0, index); - if (ret) - goto out; - - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); -out: - unlock_new_inode(inode); - if (ret) - inode_dec_link_count(inode); - iput(inode); - - return ret; + return inode; } static int btrfs_rename(struct user_namespace *mnt_userns, @@ -9296,17 +9250,21 @@ static int btrfs_rename(struct user_namespace *mnt_userns, unsigned int flags) { struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); + struct btrfs_new_inode_args whiteout_args = { + .dir = old_dir, + .dentry = old_dentry, + }; struct btrfs_trans_handle *trans; unsigned int trans_num_items; struct btrfs_root *root = BTRFS_I(old_dir)->root; struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = d_inode(new_dentry); struct inode *old_inode = d_inode(old_dentry); + struct btrfs_rename_ctx rename_ctx; u64 index = 0; int ret; int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); - bool log_pinned = false; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9350,23 +9308,56 @@ static int btrfs_rename(struct user_namespace *mnt_userns, if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size) filemap_flush(old_inode->i_mapping); - /* close the racy window with snapshot create/destroy ioctl */ - if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + if (flags & RENAME_WHITEOUT) { + whiteout_args.inode = new_whiteout_inode(mnt_userns, old_dir); + if (!whiteout_args.inode) + return -ENOMEM; + ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items); + if (ret) + goto out_whiteout_inode; + } else { + /* 1 to update the old parent inode. */ + trans_num_items = 1; + } + + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { + /* Close the race window with snapshot create/destroy ioctl */ down_read(&fs_info->subvol_sem); + /* + * 1 to remove old root ref + * 1 to remove old root backref + * 1 to add new root ref + * 1 to add new root backref + */ + trans_num_items += 4; + } else { + /* + * 1 to update inode + * 1 to remove old inode ref + * 1 to add new inode ref + */ + trans_num_items += 3; + } /* - * We want to reserve the absolute worst case amount of items. So if - * both inodes are subvols and we need to unlink them then that would - * require 4 item modifications, but if they are both normal inodes it - * would require 5 item modifications, so we'll assume they are normal - * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items - * should cover the worst case number of items we'll modify. - * If our rename has the whiteout flag, we need more 5 units for the - * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item - * when selinux is enabled). - */ - trans_num_items = 11; - if (flags & RENAME_WHITEOUT) + * 1 to remove old dir item + * 1 to remove old dir index + * 1 to add new dir item + * 1 to add new dir index + */ + trans_num_items += 4; + /* 1 to update new parent inode if it's not the same as the old parent */ + if (new_dir != old_dir) + trans_num_items++; + if (new_inode) { + /* + * 1 to update inode + * 1 to remove inode ref + * 1 to remove dir item + * 1 to remove dir index + * 1 to possibly add orphan item + */ trans_num_items += 5; + } trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -9400,9 +9391,11 @@ static int btrfs_rename(struct user_namespace *mnt_userns, inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); - old_dir->i_ctime = old_dir->i_mtime = - new_dir->i_ctime = new_dir->i_mtime = - old_inode->i_ctime = current_time(old_dir); + old_dir->i_mtime = current_time(old_dir); + old_dir->i_ctime = old_dir->i_mtime; + new_dir->i_mtime = old_dir->i_mtime; + new_dir->i_ctime = old_dir->i_mtime; + old_inode->i_ctime = old_dir->i_mtime; if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), @@ -9411,29 +9404,11 @@ static int btrfs_rename(struct user_namespace *mnt_userns, if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); } else { - /* - * Now pin the log. We do it to ensure that no other task can - * sync the log while we are in progress with the rename, as - * that could result in an inconsistency in case any of the - * inodes that are part of this rename operation were logged - * before. - * - * We pin the log even if at this precise moment none of the - * inodes was logged before. This is because right after we - * checked for that, some other task fsyncing some other inode - * not involved with this rename operation could log that one of - * our inodes exists. - * - * We don't need to pin the logs before the above call to - * btrfs_insert_inode_ref(), since that does not need to change - * a log. - */ - btrfs_pin_log_trans(root); - log_pinned = true; ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), old_dentry->d_name.name, - old_dentry->d_name.len); + old_dentry->d_name.len, + &rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } @@ -9475,51 +9450,32 @@ static int btrfs_rename(struct user_namespace *mnt_userns, if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = index; - if (log_pinned) { - btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), - new_dentry->d_parent); - btrfs_end_log_trans(root); - log_pinned = false; - } + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), + rename_ctx.index, new_dentry->d_parent); if (flags & RENAME_WHITEOUT) { - ret = btrfs_whiteout_for_rename(trans, root, mnt_userns, - old_dir, old_dentry); - + ret = btrfs_create_new_inode(trans, &whiteout_args); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; + } else { + unlock_new_inode(whiteout_args.inode); + iput(whiteout_args.inode); + whiteout_args.inode = NULL; } } out_fail: - /* - * If we have pinned the log and an error happened, we unpin tasks - * trying to sync the log and force them to fallback to a transaction - * commit if the log currently contains any of the inodes involved in - * this rename operation (to ensure we do not persist a log with an - * inconsistent state for any of these inodes or leading to any - * inconsistencies when replayed). If the transaction was aborted, the - * abortion reason is propagated to userspace when attempting to commit - * the transaction. If the log does not contain any of these inodes, we - * allow the tasks to sync it. - */ - if (ret && log_pinned) { - if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) || - btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) || - btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || - (new_inode && - btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))) - btrfs_set_log_full_commit(trans); - - btrfs_end_log_trans(root); - log_pinned = false; - } ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); - + if (flags & RENAME_WHITEOUT) + btrfs_new_inode_args_destroy(&whiteout_args); +out_whiteout_inode: + if (flags & RENAME_WHITEOUT) + iput(whiteout_args.inode); return ret; } @@ -9527,15 +9483,21 @@ static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_di struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { + int ret; + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags & RENAME_EXCHANGE) - return btrfs_rename_exchange(old_dir, old_dentry, new_dir, - new_dentry); + ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir, + new_dentry); + else + ret = btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir, + new_dentry, flags); + + btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info); - return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir, - new_dentry, flags); + return ret; } struct btrfs_delalloc_work { @@ -9738,10 +9700,13 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_path *path; struct btrfs_key key; - struct inode *inode = NULL; + struct inode *inode; + struct btrfs_new_inode_args new_inode_args = { + .dir = dir, + .dentry = dentry, + }; + unsigned int trans_num_items; int err; - u64 objectid; - u64 index = 0; int name_len; int datasize; unsigned long ptr; @@ -9752,49 +9717,40 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) return -ENAMETOOLONG; - /* - * 2 items for inode item and ref - * 2 items for dir items - * 1 item for updating parent inode item - * 1 item for the inline extent item - * 1 item for xattr if selinux is on - */ - trans = btrfs_start_transaction(root, 7); - if (IS_ERR(trans)) - return PTR_ERR(trans); + inode = new_inode(dir->i_sb); + if (!inode) + return -ENOMEM; + inode_init_owner(mnt_userns, inode, dir, S_IFLNK | S_IRWXUGO); + inode->i_op = &btrfs_symlink_inode_operations; + inode_nohighmem(inode); + inode->i_mapping->a_ops = &btrfs_aops; + btrfs_i_size_write(BTRFS_I(inode), name_len); + inode_set_bytes(inode, name_len); - err = btrfs_get_free_objectid(root, &objectid); + new_inode_args.inode = inode; + err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); if (err) - goto out_unlock; + goto out_inode; + /* 1 additional item for the inline extent */ + trans_num_items++; - inode = btrfs_new_inode(trans, root, mnt_userns, dir, - dentry->d_name.name, dentry->d_name.len, - btrfs_ino(BTRFS_I(dir)), objectid, - S_IFLNK | S_IRWXUGO, &index); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - inode = NULL; - goto out_unlock; + trans = btrfs_start_transaction(root, trans_num_items); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_new_inode_args; } - /* - * If the active LSM wants to access the inode during - * d_instantiate it needs these. Smack checks to see - * if the filesystem supports xattrs by looking at the - * ops vector. - */ - inode->i_fop = &btrfs_file_operations; - inode->i_op = &btrfs_file_inode_operations; - inode->i_mapping->a_ops = &btrfs_aops; - - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); + err = btrfs_create_new_inode(trans, &new_inode_args); if (err) - goto out_unlock; + goto out; path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; - goto out_unlock; + btrfs_abort_transaction(trans, err); + discard_new_inode(inode); + inode = NULL; + goto out; } key.objectid = btrfs_ino(BTRFS_I(inode)); key.offset = 0; @@ -9803,8 +9759,11 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, err = btrfs_insert_empty_item(trans, root, path, &key, datasize); if (err) { + btrfs_abort_transaction(trans, err); btrfs_free_path(path); - goto out_unlock; + discard_new_inode(inode); + inode = NULL; + goto out; } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], @@ -9822,31 +9781,16 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - inode->i_op = &btrfs_symlink_inode_operations; - inode_nohighmem(inode); - inode_set_bytes(inode, name_len); - btrfs_i_size_write(BTRFS_I(inode), name_len); - err = btrfs_update_inode(trans, root, BTRFS_I(inode)); - /* - * Last step, add directory indexes for our symlink inode. This is the - * last step to avoid extra cleanup of these indexes if an error happens - * elsewhere above. - */ - if (!err) - err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, - BTRFS_I(inode), 0, index); - if (err) - goto out_unlock; - d_instantiate_new(dentry, inode); - -out_unlock: + err = 0; +out: btrfs_end_transaction(trans); - if (err && inode) { - inode_dec_link_count(inode); - discard_new_inode(inode); - } btrfs_btree_balance_dirty(fs_info); +out_new_inode_args: + btrfs_new_inode_args_destroy(&new_inode_args); +out_inode: + if (err) + iput(inode); return err; } @@ -9895,6 +9839,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( extent_info.file_offset = file_offset; extent_info.extent_buf = (char *)&stack_fi; extent_info.is_new_extent = true; + extent_info.update_times = true; extent_info.qgroup_reserved = qgroup_released; extent_info.insertions = 0; @@ -9932,7 +9877,6 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_map *em; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; @@ -9988,13 +9932,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, break; } - btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset, - cur_offset + ins.offset -1, 0); - em = alloc_extent_map(); if (!em) { - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset, + cur_offset + ins.offset - 1, false); + btrfs_set_inode_full_sync(BTRFS_I(inode)); goto next; } @@ -10008,16 +9950,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, set_bit(EXTENT_FLAG_PREALLOC, &em->flags); em->generation = trans->transid; - while (1) { - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 1); - write_unlock(&em_tree->lock); - if (ret != -EEXIST) - break; - btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset, - cur_offset + ins.offset - 1, - 0); - } + ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true); free_extent_map(em); next: num_bytes -= ins.offset; @@ -10076,11 +10009,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, min_size, actual_len, alloc_hint, trans); } -static int btrfs_set_page_dirty(struct page *page) -{ - return __set_page_dirty_nobuffers(page); -} - static int btrfs_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask) { @@ -10098,68 +10026,64 @@ static int btrfs_permission(struct user_namespace *mnt_userns, } static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; - struct inode *inode = NULL; - u64 objectid; - u64 index; - int ret = 0; - - /* - * 5 units required for adding orphan entry - */ - trans = btrfs_start_transaction(root, 5); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - ret = btrfs_get_free_objectid(root, &objectid); - if (ret) - goto out; - - inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0, - btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - inode = NULL; - goto out; - } + struct inode *inode; + struct btrfs_new_inode_args new_inode_args = { + .dir = dir, + .dentry = file->f_path.dentry, + .orphan = true, + }; + unsigned int trans_num_items; + int ret; + inode = new_inode(dir->i_sb); + if (!inode) + return -ENOMEM; + inode_init_owner(mnt_userns, inode, dir, mode); inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; - inode->i_mapping->a_ops = &btrfs_aops; - ret = btrfs_init_inode_security(trans, inode, dir, NULL); + new_inode_args.inode = inode; + ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); if (ret) - goto out; + goto out_inode; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); - if (ret) - goto out; - ret = btrfs_orphan_add(trans, BTRFS_I(inode)); - if (ret) - goto out; + trans = btrfs_start_transaction(root, trans_num_items); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_new_inode_args; + } + + ret = btrfs_create_new_inode(trans, &new_inode_args); /* - * We set number of links to 0 in btrfs_new_inode(), and here we set - * it to 1 because d_tmpfile() will issue a warning if the count is 0, - * through: + * We set number of links to 0 in btrfs_create_new_inode(), and here we + * set it to 1 because d_tmpfile() will issue a warning if the count is + * 0, through: * * d_tmpfile() -> inode_dec_link_count() -> drop_nlink() */ set_nlink(inode, 1); - d_tmpfile(dentry, inode); - unlock_new_inode(inode); - mark_inode_dirty(inode); -out: + + if (!ret) { + d_tmpfile(file, inode); + unlock_new_inode(inode); + mark_inode_dirty(inode); + } + btrfs_end_transaction(trans); - if (ret && inode) - discard_new_inode(inode); btrfs_btree_balance_dirty(fs_info); - return ret; +out_new_inode_args: + btrfs_new_inode_args_destroy(&new_inode_args); +out_inode: + if (ret) + iput(inode); + return finish_open_simple(file, ret); } void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) @@ -10182,6 +10106,729 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) } } +int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, + int compress_type) +{ + switch (compress_type) { + case BTRFS_COMPRESS_NONE: + return BTRFS_ENCODED_IO_COMPRESSION_NONE; + case BTRFS_COMPRESS_ZLIB: + return BTRFS_ENCODED_IO_COMPRESSION_ZLIB; + case BTRFS_COMPRESS_LZO: + /* + * The LZO format depends on the sector size. 64K is the maximum + * sector size that we support. + */ + if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K) + return -EINVAL; + return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + + (fs_info->sectorsize_bits - 12); + case BTRFS_COMPRESS_ZSTD: + return BTRFS_ENCODED_IO_COMPRESSION_ZSTD; + default: + return -EUCLEAN; + } +} + +static ssize_t btrfs_encoded_read_inline( + struct kiocb *iocb, + struct iov_iter *iter, u64 start, + u64 lockend, + struct extent_state **cached_state, + u64 extent_start, size_t count, + struct btrfs_ioctl_encoded_io_args *encoded, + bool *unlocked) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *item; + u64 ram_bytes; + unsigned long ptr; + void *tmp; + ssize_t ret; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), + extent_start, 0); + if (ret) { + if (ret > 0) { + /* The extent item disappeared? */ + ret = -EIO; + } + goto out; + } + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + + ram_bytes = btrfs_file_extent_ram_bytes(leaf, item); + ptr = btrfs_file_extent_inline_start(item); + + encoded->len = min_t(u64, extent_start + ram_bytes, + inode->vfs_inode.i_size) - iocb->ki_pos; + ret = btrfs_encoded_io_compression_from_extent(fs_info, + btrfs_file_extent_compression(leaf, item)); + if (ret < 0) + goto out; + encoded->compression = ret; + if (encoded->compression) { + size_t inline_size; + + inline_size = btrfs_file_extent_inline_item_len(leaf, + path->slots[0]); + if (inline_size > count) { + ret = -ENOBUFS; + goto out; + } + count = inline_size; + encoded->unencoded_len = ram_bytes; + encoded->unencoded_offset = iocb->ki_pos - extent_start; + } else { + count = min_t(u64, count, encoded->len); + encoded->len = count; + encoded->unencoded_len = count; + ptr += iocb->ki_pos - extent_start; + } + + tmp = kmalloc(count, GFP_NOFS); + if (!tmp) { + ret = -ENOMEM; + goto out; + } + read_extent_buffer(leaf, tmp, ptr, count); + btrfs_release_path(path); + unlock_extent(io_tree, start, lockend, cached_state); + btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + *unlocked = true; + + ret = copy_to_iter(tmp, count, iter); + if (ret != count) + ret = -EFAULT; + kfree(tmp); +out: + btrfs_free_path(path); + return ret; +} + +struct btrfs_encoded_read_private { + struct btrfs_inode *inode; + u64 file_offset; + wait_queue_head_t wait; + atomic_t pending; + blk_status_t status; + bool skip_csum; +}; + +static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, + struct bio *bio, int mirror_num) +{ + struct btrfs_encoded_read_private *priv = btrfs_bio(bio)->private; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + blk_status_t ret; + + if (!priv->skip_csum) { + ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); + if (ret) + return ret; + } + + atomic_inc(&priv->pending); + btrfs_submit_bio(fs_info, bio, mirror_num); + return BLK_STS_OK; +} + +static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) +{ + const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK); + struct btrfs_encoded_read_private *priv = bbio->private; + struct btrfs_inode *inode = priv->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u32 sectorsize = fs_info->sectorsize; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + u32 bio_offset = 0; + + if (priv->skip_csum || !uptodate) + return bbio->bio.bi_status; + + bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { + unsigned int i, nr_sectors, pgoff; + + nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); + pgoff = bvec->bv_offset; + for (i = 0; i < nr_sectors; i++) { + ASSERT(pgoff < PAGE_SIZE); + if (btrfs_check_data_csum(&inode->vfs_inode, bbio, bio_offset, + bvec->bv_page, pgoff)) + return BLK_STS_IOERR; + bio_offset += sectorsize; + pgoff += sectorsize; + } + } + return BLK_STS_OK; +} + +static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) +{ + struct btrfs_encoded_read_private *priv = bbio->private; + blk_status_t status; + + status = btrfs_encoded_read_verify_csum(bbio); + if (status) { + /* + * The memory barrier implied by the atomic_dec_return() here + * pairs with the memory barrier implied by the + * atomic_dec_return() or io_wait_event() in + * btrfs_encoded_read_regular_fill_pages() to ensure that this + * write is observed before the load of status in + * btrfs_encoded_read_regular_fill_pages(). + */ + WRITE_ONCE(priv->status, status); + } + if (!atomic_dec_return(&priv->pending)) + wake_up(&priv->wait); + btrfs_bio_free_csum(bbio); + bio_put(&bbio->bio); +} + +int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + u64 file_offset, u64 disk_bytenr, + u64 disk_io_size, struct page **pages) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_encoded_read_private priv = { + .inode = inode, + .file_offset = file_offset, + .pending = ATOMIC_INIT(1), + .skip_csum = (inode->flags & BTRFS_INODE_NODATASUM), + }; + unsigned long i = 0; + u64 cur = 0; + int ret; + + init_waitqueue_head(&priv.wait); + /* + * Submit bios for the extent, splitting due to bio or stripe limits as + * necessary. + */ + while (cur < disk_io_size) { + struct extent_map *em; + struct btrfs_io_geometry geom; + struct bio *bio = NULL; + u64 remaining; + + em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur, + disk_io_size - cur); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + } else { + ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ, + disk_bytenr + cur, &geom); + free_extent_map(em); + } + if (ret) { + WRITE_ONCE(priv.status, errno_to_blk_status(ret)); + break; + } + remaining = min(geom.len, disk_io_size - cur); + while (bio || remaining) { + size_t bytes = min_t(u64, remaining, PAGE_SIZE); + + if (!bio) { + bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, + btrfs_encoded_read_endio, + &priv); + bio->bi_iter.bi_sector = + (disk_bytenr + cur) >> SECTOR_SHIFT; + } + + if (!bytes || + bio_add_page(bio, pages[i], bytes, 0) < bytes) { + blk_status_t status; + + status = submit_encoded_read_bio(inode, bio, 0); + if (status) { + WRITE_ONCE(priv.status, status); + bio_put(bio); + goto out; + } + bio = NULL; + continue; + } + + i++; + cur += bytes; + remaining -= bytes; + } + } + +out: + if (atomic_dec_return(&priv.pending)) + io_wait_event(priv.wait, !atomic_read(&priv.pending)); + /* See btrfs_encoded_read_endio() for ordering. */ + return blk_status_to_errno(READ_ONCE(priv.status)); +} + +static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, + struct iov_iter *iter, + u64 start, u64 lockend, + struct extent_state **cached_state, + u64 disk_bytenr, u64 disk_io_size, + size_t count, bool compressed, + bool *unlocked) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct extent_io_tree *io_tree = &inode->io_tree; + struct page **pages; + unsigned long nr_pages, i; + u64 cur; + size_t page_offset; + ssize_t ret; + + nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE); + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) + return -ENOMEM; + ret = btrfs_alloc_page_array(nr_pages, pages); + if (ret) { + ret = -ENOMEM; + goto out; + } + + ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr, + disk_io_size, pages); + if (ret) + goto out; + + unlock_extent(io_tree, start, lockend, cached_state); + btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + *unlocked = true; + + if (compressed) { + i = 0; + page_offset = 0; + } else { + i = (iocb->ki_pos - start) >> PAGE_SHIFT; + page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1); + } + cur = 0; + while (cur < count) { + size_t bytes = min_t(size_t, count - cur, + PAGE_SIZE - page_offset); + + if (copy_page_to_iter(pages[i], page_offset, bytes, + iter) != bytes) { + ret = -EFAULT; + goto out; + } + i++; + cur += bytes; + page_offset = 0; + } + ret = count; +out: + for (i = 0; i < nr_pages; i++) { + if (pages[i]) + __free_page(pages[i]); + } + kfree(pages); + return ret; +} + +ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, + struct btrfs_ioctl_encoded_io_args *encoded) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + ssize_t ret; + size_t count = iov_iter_count(iter); + u64 start, lockend, disk_bytenr, disk_io_size; + struct extent_state *cached_state = NULL; + struct extent_map *em; + bool unlocked = false; + + file_accessed(iocb->ki_filp); + + btrfs_inode_lock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + + if (iocb->ki_pos >= inode->vfs_inode.i_size) { + btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + return 0; + } + start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize); + /* + * We don't know how long the extent containing iocb->ki_pos is, but if + * it's compressed we know that it won't be longer than this. + */ + lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; + + for (;;) { + struct btrfs_ordered_extent *ordered; + + ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, + lockend - start + 1); + if (ret) + goto out_unlock_inode; + lock_extent(io_tree, start, lockend, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, start, + lockend - start + 1); + if (!ordered) + break; + btrfs_put_ordered_extent(ordered); + unlock_extent(io_tree, start, lockend, &cached_state); + cond_resched(); + } + + em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out_unlock_extent; + } + + if (em->block_start == EXTENT_MAP_INLINE) { + u64 extent_start = em->start; + + /* + * For inline extents we get everything we need out of the + * extent item. + */ + free_extent_map(em); + em = NULL; + ret = btrfs_encoded_read_inline(iocb, iter, start, lockend, + &cached_state, extent_start, + count, encoded, &unlocked); + goto out; + } + + /* + * We only want to return up to EOF even if the extent extends beyond + * that. + */ + encoded->len = min_t(u64, extent_map_end(em), + inode->vfs_inode.i_size) - iocb->ki_pos; + if (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + disk_bytenr = EXTENT_MAP_HOLE; + count = min_t(u64, count, encoded->len); + encoded->len = count; + encoded->unencoded_len = count; + } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + disk_bytenr = em->block_start; + /* + * Bail if the buffer isn't large enough to return the whole + * compressed extent. + */ + if (em->block_len > count) { + ret = -ENOBUFS; + goto out_em; + } + disk_io_size = em->block_len; + count = em->block_len; + encoded->unencoded_len = em->ram_bytes; + encoded->unencoded_offset = iocb->ki_pos - em->orig_start; + ret = btrfs_encoded_io_compression_from_extent(fs_info, + em->compress_type); + if (ret < 0) + goto out_em; + encoded->compression = ret; + } else { + disk_bytenr = em->block_start + (start - em->start); + if (encoded->len > count) + encoded->len = count; + /* + * Don't read beyond what we locked. This also limits the page + * allocations that we'll do. + */ + disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; + count = start + disk_io_size - iocb->ki_pos; + encoded->len = count; + encoded->unencoded_len = count; + disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize); + } + free_extent_map(em); + em = NULL; + + if (disk_bytenr == EXTENT_MAP_HOLE) { + unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + unlocked = true; + ret = iov_iter_zero(count, iter); + if (ret != count) + ret = -EFAULT; + } else { + ret = btrfs_encoded_read_regular(iocb, iter, start, lockend, + &cached_state, disk_bytenr, + disk_io_size, count, + encoded->compression, + &unlocked); + } + +out: + if (ret >= 0) + iocb->ki_pos += encoded->len; +out_em: + free_extent_map(em); +out_unlock_extent: + if (!unlocked) + unlock_extent(io_tree, start, lockend, &cached_state); +out_unlock_inode: + if (!unlocked) + btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + return ret; +} + +ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + struct extent_changeset *data_reserved = NULL; + struct extent_state *cached_state = NULL; + int compression; + size_t orig_count; + u64 start, end; + u64 num_bytes, ram_bytes, disk_num_bytes; + unsigned long nr_pages, i; + struct page **pages; + struct btrfs_key ins; + bool extent_reserved = false; + struct extent_map *em; + ssize_t ret; + + switch (encoded->compression) { + case BTRFS_ENCODED_IO_COMPRESSION_ZLIB: + compression = BTRFS_COMPRESS_ZLIB; + break; + case BTRFS_ENCODED_IO_COMPRESSION_ZSTD: + compression = BTRFS_COMPRESS_ZSTD; + break; + case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K: + case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K: + case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K: + case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K: + case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K: + /* The sector size must match for LZO. */ + if (encoded->compression - + BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 != + fs_info->sectorsize_bits) + return -EINVAL; + compression = BTRFS_COMPRESS_LZO; + break; + default: + return -EINVAL; + } + if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE) + return -EINVAL; + + orig_count = iov_iter_count(from); + + /* The extent size must be sane. */ + if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED || + orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0) + return -EINVAL; + + /* + * The compressed data must be smaller than the decompressed data. + * + * It's of course possible for data to compress to larger or the same + * size, but the buffered I/O path falls back to no compression for such + * data, and we don't want to break any assumptions by creating these + * extents. + * + * Note that this is less strict than the current check we have that the + * compressed data must be at least one sector smaller than the + * decompressed data. We only want to enforce the weaker requirement + * from old kernels that it is at least one byte smaller. + */ + if (orig_count >= encoded->unencoded_len) + return -EINVAL; + + /* The extent must start on a sector boundary. */ + start = iocb->ki_pos; + if (!IS_ALIGNED(start, fs_info->sectorsize)) + return -EINVAL; + + /* + * The extent must end on a sector boundary. However, we allow a write + * which ends at or extends i_size to have an unaligned length; we round + * up the extent size and set i_size to the unaligned end. + */ + if (start + encoded->len < inode->vfs_inode.i_size && + !IS_ALIGNED(start + encoded->len, fs_info->sectorsize)) + return -EINVAL; + + /* Finally, the offset in the unencoded data must be sector-aligned. */ + if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize)) + return -EINVAL; + + num_bytes = ALIGN(encoded->len, fs_info->sectorsize); + ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize); + end = start + num_bytes - 1; + + /* + * If the extent cannot be inline, the compressed data on disk must be + * sector-aligned. For convenience, we extend it with zeroes if it + * isn't. + */ + disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); + nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); + pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT); + if (!pages) + return -ENOMEM; + for (i = 0; i < nr_pages; i++) { + size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from)); + char *kaddr; + + pages[i] = alloc_page(GFP_KERNEL_ACCOUNT); + if (!pages[i]) { + ret = -ENOMEM; + goto out_pages; + } + kaddr = kmap_local_page(pages[i]); + if (copy_from_iter(kaddr, bytes, from) != bytes) { + kunmap_local(kaddr); + ret = -EFAULT; + goto out_pages; + } + if (bytes < PAGE_SIZE) + memset(kaddr + bytes, 0, PAGE_SIZE - bytes); + kunmap_local(kaddr); + } + + for (;;) { + struct btrfs_ordered_extent *ordered; + + ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes); + if (ret) + goto out_pages; + ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping, + start >> PAGE_SHIFT, + end >> PAGE_SHIFT); + if (ret) + goto out_pages; + lock_extent(io_tree, start, end, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); + if (!ordered && + !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end)) + break; + if (ordered) + btrfs_put_ordered_extent(ordered); + unlock_extent(io_tree, start, end, &cached_state); + cond_resched(); + } + + /* + * We don't use the higher-level delalloc space functions because our + * num_bytes and disk_num_bytes are different. + */ + ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes); + if (ret) + goto out_unlock; + ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes); + if (ret) + goto out_free_data_space; + ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes, + false); + if (ret) + goto out_qgroup_free_data; + + /* Try an inline extent first. */ + if (start == 0 && encoded->unencoded_len == encoded->len && + encoded->unencoded_offset == 0) { + ret = cow_file_range_inline(inode, encoded->len, orig_count, + compression, pages, true); + if (ret <= 0) { + if (ret == 0) + ret = orig_count; + goto out_delalloc_release; + } + } + + ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes, + disk_num_bytes, 0, 0, &ins, 1, 1); + if (ret) + goto out_delalloc_release; + extent_reserved = true; + + em = create_io_em(inode, start, num_bytes, + start - encoded->unencoded_offset, ins.objectid, + ins.offset, ins.offset, ram_bytes, compression, + BTRFS_ORDERED_COMPRESSED); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out_free_reserved; + } + free_extent_map(em); + + ret = btrfs_add_ordered_extent(inode, start, num_bytes, ram_bytes, + ins.objectid, ins.offset, + encoded->unencoded_offset, + (1 << BTRFS_ORDERED_ENCODED) | + (1 << BTRFS_ORDERED_COMPRESSED), + compression); + if (ret) { + btrfs_drop_extent_map_range(inode, start, end, false); + goto out_free_reserved; + } + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + + if (start + encoded->len > inode->vfs_inode.i_size) + i_size_write(&inode->vfs_inode, start + encoded->len); + + unlock_extent(io_tree, start, end, &cached_state); + + btrfs_delalloc_release_extents(inode, num_bytes); + + if (btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid, + ins.offset, pages, nr_pages, 0, NULL, + false)) { + btrfs_writepage_endio_finish_ordered(inode, pages[0], start, end, 0); + ret = -EIO; + goto out_pages; + } + ret = orig_count; + goto out; + +out_free_reserved: + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); +out_delalloc_release: + btrfs_delalloc_release_extents(inode, num_bytes); + btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0); +out_qgroup_free_data: + if (ret < 0) + btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes); +out_free_data_space: + /* + * If btrfs_reserve_extent() succeeded, then we already decremented + * bytes_may_use. + */ + if (!extent_reserved) + btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes); +out_unlock: + unlock_extent(io_tree, start, end, &cached_state); +out_pages: + for (i = 0; i < nr_pages; i++) { + if (pages[i]) + __free_page(pages[i]); + } + kvfree(pages); +out: + if (ret >= 0) + iocb->ki_pos += encoded->len; + return ret; +} + #ifdef CONFIG_SWAP /* * Add an entry indicating a block group or device which is pinned by a @@ -10390,12 +11037,27 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, * set. We use this counter to prevent snapshots. We must increment it * before walking the extents because we don't want a concurrent * snapshot to run after we've already checked the extents. + * + * It is possible that subvolume is marked for deletion but still not + * removed yet. To prevent this race, we check the root status before + * activating the swapfile. */ + spin_lock(&root->root_item_lock); + if (btrfs_root_dead(root)) { + spin_unlock(&root->root_item_lock); + + btrfs_exclop_finish(fs_info); + btrfs_warn(fs_info, + "cannot activate swapfile because subvolume %llu is being deleted", + root->root_key.objectid); + return -EPERM; + } atomic_inc(&root->nr_swapfiles); + spin_unlock(&root->root_item_lock); isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); - lock_extent_bits(io_tree, 0, isize - 1, &cached_state); + lock_extent(io_tree, 0, isize - 1, &cached_state); start = 0; while (start < isize) { u64 logical_block_start, physical_block_start; @@ -10436,7 +11098,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, free_extent_map(em); em = NULL; - ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true); + ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true); if (ret < 0) { goto out; } else if (ret) { @@ -10532,7 +11194,7 @@ out: if (!IS_ERR_OR_NULL(em)) free_extent_map(em); - unlock_extent_cached(io_tree, 0, isize - 1, &cached_state); + unlock_extent(io_tree, 0, isize - 1, &cached_state); if (ret) btrfs_swap_deactivate(file); @@ -10585,6 +11247,41 @@ void btrfs_update_inode_bytes(struct btrfs_inode *inode, spin_unlock(&inode->lock); } +/** + * Verify that there are no ordered extents for a given file range. + * + * @inode: The target inode. + * @start: Start offset of the file range, should be sector size aligned. + * @end: End offset (inclusive) of the file range, its value +1 should be + * sector size aligned. + * + * This should typically be used for cases where we locked an inode's VFS lock in + * exclusive mode, we have also locked the inode's i_mmap_lock in exclusive mode, + * we have flushed all delalloc in the range, we have waited for all ordered + * extents in the range to complete and finally we have locked the file range in + * the inode's io_tree. + */ +void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end) +{ + struct btrfs_root *root = inode->root; + struct btrfs_ordered_extent *ordered; + + if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) + return; + + ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start); + if (ordered) { + btrfs_err(root->fs_info, +"found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])", + start, end, btrfs_ino(inode), root->root_key.objectid, + ordered->file_offset, + ordered->file_offset + ordered->num_bytes - 1); + btrfs_put_ordered_extent(ordered); + } + + ASSERT(ordered == NULL); +} + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, @@ -10633,17 +11330,14 @@ static const struct file_operations btrfs_dir_file_operations = { * For now we're avoiding this by dropping bmap. */ static const struct address_space_operations btrfs_aops = { - .readpage = btrfs_readpage, - .writepage = btrfs_writepage, + .read_folio = btrfs_read_folio, .writepages = btrfs_writepages, .readahead = btrfs_readahead, .direct_IO = noop_direct_IO, - .invalidatepage = btrfs_invalidatepage, - .releasepage = btrfs_releasepage, -#ifdef CONFIG_MIGRATION - .migratepage = btrfs_migratepage, -#endif - .set_page_dirty = btrfs_set_page_dirty, + .invalidate_folio = btrfs_invalidate_folio, + .release_folio = btrfs_release_folio, + .migrate_folio = btrfs_migrate_folio, + .dirty_folio = filemap_dirty_folio, .error_remove_page = generic_error_remove_page, .swap_activate = btrfs_swap_activate, .swap_deactivate = btrfs_swap_deactivate, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d8af6620a941..5ba2e810dc6e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -28,6 +28,7 @@ #include <linux/iversion.h> #include <linux/fileattr.h> #include <linux/fsverity.h> +#include <linux/sched/xacct.h> #include "ctree.h" #include "disk-io.h" #include "export.h" @@ -88,6 +89,24 @@ struct btrfs_ioctl_send_args_32 { #define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \ struct btrfs_ioctl_send_args_32) + +struct btrfs_ioctl_encoded_io_args_32 { + compat_uptr_t iov; + compat_ulong_t iovcnt; + __s64 offset; + __u64 flags; + __u64 len; + __u64 unencoded_len; + __u64 unencoded_offset; + __u32 compression; + __u32 encryption; + __u8 reserved[64]; +}; + +#define BTRFS_IOC_ENCODED_READ_32 _IOR(BTRFS_IOCTL_MAGIC, 64, \ + struct btrfs_ioctl_encoded_io_args_32) +#define BTRFS_IOC_ENCODED_WRITE_32 _IOW(BTRFS_IOCTL_MAGIC, 64, \ + struct btrfs_ioctl_encoded_io_args_32) #endif /* Mask out flags that are inappropriate for the given type of inode. */ @@ -440,10 +459,8 @@ void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, } } -static int btrfs_ioctl_getversion(struct file *file, int __user *arg) +static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg) { - struct inode *inode = file_inode(file); - return put_user(inode->i_generation, arg); } @@ -451,7 +468,6 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_device *device; - struct request_queue *q; struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; @@ -481,14 +497,11 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, rcu_read_lock(); list_for_each_entry_rcu(device, &fs_info->fs_devices->devices, dev_list) { - if (!device->bdev) + if (!device->bdev || !bdev_max_discard_sectors(device->bdev)) continue; - q = bdev_get_queue(device->bdev); - if (blk_queue_discard(q)) { - num_devices++; - minlen = min_t(u64, q->limits.discard_granularity, - minlen); - } + num_devices++; + minlen = min_t(u64, bdev_discard_granularity(device->bdev), + minlen); } rcu_read_unlock(); @@ -527,9 +540,35 @@ int __pure btrfs_is_empty_uuid(u8 *uuid) return 1; } +/* + * Calculate the number of transaction items to reserve for creating a subvolume + * or snapshot, not including the inode, directory entries, or parent directory. + */ +static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit) +{ + /* + * 1 to add root block + * 1 to add root item + * 1 to add root ref + * 1 to add root backref + * 1 to add UUID item + * 1 to add qgroup info + * 1 to add qgroup limit + * + * Ideally the last two would only be accounted if qgroups are enabled, + * but that can change between now and the time we would insert them. + */ + unsigned int num_items = 7; + + if (inherit) { + /* 2 to add qgroup relations for each inherited qgroup */ + num_items += 2 * inherit->num_qgroups; + } + return num_items; +} + static noinline int create_subvol(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, - const char *name, int namelen, struct btrfs_qgroup_inherit *inherit) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); @@ -542,11 +581,15 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, struct btrfs_root *new_root; struct btrfs_block_rsv block_rsv; struct timespec64 cur_time = current_time(dir); - struct inode *inode; + struct btrfs_new_inode_args new_inode_args = { + .dir = dir, + .dentry = dentry, + .subvol = true, + }; + unsigned int trans_num_items; int ret; - dev_t anon_dev = 0; + dev_t anon_dev; u64 objectid; - u64 index = 0; root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); if (!root_item) @@ -554,11 +597,7 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid); if (ret) - goto fail_free; - - ret = get_anon_bdev(&anon_dev); - if (ret < 0) - goto fail_free; + goto out_root_item; /* * Don't create subvolume whose level is not zero. Or qgroup will be @@ -566,36 +605,47 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, */ if (btrfs_qgroup_level(objectid)) { ret = -ENOSPC; - goto fail_free; + goto out_root_item; + } + + ret = get_anon_bdev(&anon_dev); + if (ret < 0) + goto out_root_item; + + new_inode_args.inode = btrfs_new_subvol_inode(mnt_userns, dir); + if (!new_inode_args.inode) { + ret = -ENOMEM; + goto out_anon_dev; } + ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); + if (ret) + goto out_inode; + trans_num_items += create_subvol_num_items(inherit); btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); - /* - * The same as the snapshot creation, please see the comment - * of create_snapshot(). - */ - ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, false); + ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, + trans_num_items, false); if (ret) - goto fail_free; + goto out_new_inode_args; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); btrfs_subvolume_release_metadata(root, &block_rsv); - goto fail_free; + goto out_new_inode_args; } trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit); if (ret) - goto fail; + goto out; leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); - goto fail; + goto out; } btrfs_mark_buffer_dirty(leaf); @@ -650,75 +700,46 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, btrfs_tree_unlock(leaf); btrfs_free_tree_block(trans, objectid, leaf, 0, 1); free_extent_buffer(leaf); - goto fail; + goto out; } free_extent_buffer(leaf); leaf = NULL; - key.offset = (u64)-1; new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev); if (IS_ERR(new_root)) { - free_anon_bdev(anon_dev); ret = PTR_ERR(new_root); btrfs_abort_transaction(trans, ret); - goto fail; + goto out; } - /* Freeing will be done in btrfs_put_root() of new_root */ + /* anon_dev is owned by new_root now. */ anon_dev = 0; + BTRFS_I(new_inode_args.inode)->root = new_root; + /* ... and new_root is owned by new_inode_args.inode now. */ ret = btrfs_record_root_in_trans(trans, new_root); if (ret) { - btrfs_put_root(new_root); - btrfs_abort_transaction(trans, ret); - goto fail; - } - - ret = btrfs_create_subvol_root(trans, new_root, root, mnt_userns); - btrfs_put_root(new_root); - if (ret) { - /* We potentially lose an unused inode item here */ - btrfs_abort_transaction(trans, ret); - goto fail; - } - - /* - * insert the directory item - */ - ret = btrfs_set_inode_index(BTRFS_I(dir), &index); - if (ret) { btrfs_abort_transaction(trans, ret); - goto fail; - } - - ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key, - BTRFS_FT_DIR, index); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto fail; + goto out; } - btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2); - ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); + ret = btrfs_uuid_tree_add(trans, root_item->uuid, + BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) { btrfs_abort_transaction(trans, ret); - goto fail; + goto out; } - ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid, - btrfs_ino(BTRFS_I(dir)), index, name, namelen); + ret = btrfs_create_new_inode(trans, &new_inode_args); if (ret) { btrfs_abort_transaction(trans, ret); - goto fail; + goto out; } - ret = btrfs_uuid_tree_add(trans, root_item->uuid, - BTRFS_UUID_KEY_SUBVOL, objectid); - if (ret) - btrfs_abort_transaction(trans, ret); + d_instantiate_new(dentry, new_inode_args.inode); + new_inode_args.inode = NULL; -fail: - kfree(root_item); +out: trans->block_rsv = NULL; trans->bytes_reserved = 0; btrfs_subvolume_release_metadata(root, &block_rsv); @@ -727,18 +748,14 @@ fail: btrfs_end_transaction(trans); else ret = btrfs_commit_transaction(trans); - - if (!ret) { - inode = btrfs_lookup_dentry(dir, dentry); - if (IS_ERR(inode)) - return PTR_ERR(inode); - d_instantiate(dentry, inode); - } - return ret; - -fail_free: +out_new_inode_args: + btrfs_new_inode_args_destroy(&new_inode_args); +out_inode: + iput(new_inode_args.inode); +out_anon_dev: if (anon_dev) free_anon_bdev(anon_dev); +out_root_item: kfree(root_item); return ret; } @@ -750,9 +767,17 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; + unsigned int trans_num_items; struct btrfs_trans_handle *trans; int ret; + /* We do not support snapshotting right now. */ + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_warn(fs_info, + "extent tree v2 doesn't support snapshotting yet"); + return -EOPNOTSUPP; + } + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return -EINVAL; @@ -780,16 +805,14 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); /* - * 1 - parent dir inode - * 2 - dir entries - * 1 - root item - * 2 - root ref/backref - * 1 - root of snapshot - * 1 - UUID item + * 1 to add dir item + * 1 to add dir index + * 1 to update parent inode item */ + trans_num_items = create_subvol_num_items(inherit) + 3; ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, - &pending_snapshot->block_rsv, 8, - false); + &pending_snapshot->block_rsv, + trans_num_items, false); if (ret) goto free_pending; @@ -805,10 +828,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, goto fail; } - spin_lock(&fs_info->trans_lock); - list_add(&pending_snapshot->list, - &trans->transaction->pending_snapshots); - spin_unlock(&fs_info->trans_lock); + trans->pending_snapshot = pending_snapshot; ret = btrfs_commit_transaction(trans); if (ret) @@ -962,7 +982,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, if (snap_src) error = create_snapshot(snap_src, dir, dentry, readonly, inherit); else - error = create_subvol(mnt_userns, dir, dentry, name, namelen, inherit); + error = create_subvol(mnt_userns, dir, dentry, inherit); if (!error) fsnotify_mkdir(dir, dentry); @@ -1015,8 +1035,155 @@ out: return ret; } +/* + * Defrag specific helper to get an extent map. + * + * Differences between this and btrfs_get_extent() are: + * + * - No extent_map will be added to inode->extent_tree + * To reduce memory usage in the long run. + * + * - Extra optimization to skip file extents older than @newer_than + * By using btrfs_search_forward() we can skip entire file ranges that + * have extents created in past transactions, because btrfs_search_forward() + * will not visit leaves and nodes with a generation smaller than given + * minimal generation threshold (@newer_than). + * + * Return valid em if we find a file extent matching the requirement. + * Return NULL if we can not find a file extent matching the requirement. + * + * Return ERR_PTR() for error. + */ +static struct extent_map *defrag_get_extent(struct btrfs_inode *inode, + u64 start, u64 newer_than) +{ + struct btrfs_root *root = inode->root; + struct btrfs_file_extent_item *fi; + struct btrfs_path path = { 0 }; + struct extent_map *em; + struct btrfs_key key; + u64 ino = btrfs_ino(inode); + int ret; + + em = alloc_extent_map(); + if (!em) { + ret = -ENOMEM; + goto err; + } + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + + if (newer_than) { + ret = btrfs_search_forward(root, &key, &path, newer_than); + if (ret < 0) + goto err; + /* Can't find anything newer */ + if (ret > 0) + goto not_found; + } else { + ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0); + if (ret < 0) + goto err; + } + if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) { + /* + * If btrfs_search_slot() makes path to point beyond nritems, + * we should not have an empty leaf, as this inode must at + * least have its INODE_ITEM. + */ + ASSERT(btrfs_header_nritems(path.nodes[0])); + path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1; + } + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + /* Perfect match, no need to go one slot back */ + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY && + key.offset == start) + goto iterate; + + /* We didn't find a perfect match, needs to go one slot back */ + if (path.slots[0] > 0) { + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) + path.slots[0]--; + } + +iterate: + /* Iterate through the path to find a file extent covering @start */ + while (true) { + u64 extent_end; + + if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) + goto next; + + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); + + /* + * We may go one slot back to INODE_REF/XATTR item, then + * need to go forward until we reach an EXTENT_DATA. + * But we should still has the correct ino as key.objectid. + */ + if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY) + goto next; + + /* It's beyond our target range, definitely not extent found */ + if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY) + goto not_found; + + /* + * | |<- File extent ->| + * \- start + * + * This means there is a hole between start and key.offset. + */ + if (key.offset > start) { + em->start = start; + em->orig_start = start; + em->block_start = EXTENT_MAP_HOLE; + em->len = key.offset - start; + break; + } + + fi = btrfs_item_ptr(path.nodes[0], path.slots[0], + struct btrfs_file_extent_item); + extent_end = btrfs_file_extent_end(&path); + + /* + * |<- file extent ->| | + * \- start + * + * We haven't reached start, search next slot. + */ + if (extent_end <= start) + goto next; + + /* Now this extent covers @start, convert it to em */ + btrfs_extent_item_to_extent_map(inode, &path, fi, false, em); + break; +next: + ret = btrfs_next_item(root, &path); + if (ret < 0) + goto err; + if (ret > 0) + goto not_found; + } + btrfs_release_path(&path); + return em; + +not_found: + btrfs_release_path(&path); + free_extent_map(em); + return NULL; + +err: + btrfs_release_path(&path); + free_extent_map(em); + return ERR_PTR(ret); +} + static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, - bool locked) + u64 newer_than, bool locked) { struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -1031,16 +1198,30 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, em = lookup_extent_mapping(em_tree, start, sectorsize); read_unlock(&em_tree->lock); + /* + * We can get a merged extent, in that case, we need to re-search + * tree to get the original em for defrag. + * + * If @newer_than is 0 or em::generation < newer_than, we can trust + * this em, as either we don't care about the generation, or the + * merged extent map will be rejected anyway. + */ + if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) && + newer_than && em->generation >= newer_than) { + free_extent_map(em); + em = NULL; + } + if (!em) { struct extent_state *cached = NULL; u64 end = start + sectorsize - 1; /* get the big lock and read metadata off disk */ if (!locked) - lock_extent_bits(io_tree, start, end, &cached); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, sectorsize); + lock_extent(io_tree, start, end, &cached); + em = defrag_get_extent(BTRFS_I(inode), start, newer_than); if (!locked) - unlock_extent_cached(io_tree, start, end, &cached); + unlock_extent(io_tree, start, end, &cached); if (IS_ERR(em)) return NULL; @@ -1049,23 +1230,52 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, return em; } +static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info, + const struct extent_map *em) +{ + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + return BTRFS_MAX_COMPRESSED; + return fs_info->max_extent_size; +} + static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, - bool locked) + u32 extent_thresh, u64 newer_than, bool locked) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *next; - bool ret = true; + bool ret = false; /* this is the last extent */ if (em->start + em->len >= i_size_read(inode)) return false; - next = defrag_lookup_extent(inode, em->start + em->len, locked); + /* + * Here we need to pass @newer_then when checking the next extent, or + * we will hit a case we mark current extent for defrag, but the next + * one will not be a target. + * This will just cause extra IO without really reducing the fragments. + */ + next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked); + /* No more em or hole */ if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) - ret = false; - else if ((em->block_start + em->block_len == next->block_start) && - (em->block_len > SZ_128K && next->block_len > SZ_128K)) - ret = false; + goto out; + if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags)) + goto out; + /* + * If the next extent is at its max capacity, defragging current extent + * makes no sense, as the total number of extents won't change. + */ + if (next->len >= get_extent_max_capacity(fs_info, em)) + goto out; + /* Skip older extent */ + if (next->generation < newer_than) + goto out; + /* Also check extent size */ + if (next->len >= extent_thresh) + goto out; + ret = true; +out: free_extent_map(next); return ret; } @@ -1123,10 +1333,10 @@ again: while (1) { struct btrfs_ordered_extent *ordered; - lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state); + lock_extent(&inode->io_tree, page_start, page_end, &cached_state); ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); - unlock_extent_cached(&inode->io_tree, page_start, page_end, - &cached_state); + unlock_extent(&inode->io_tree, page_start, page_end, + &cached_state); if (!ordered) break; @@ -1150,7 +1360,7 @@ again: * make it uptodate. */ if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); + btrfs_read_folio(NULL, page_folio(page)); lock_page(page); if (page->mapping != mapping || !PagePrivate(page)) { unlock_page(page); @@ -1189,8 +1399,11 @@ struct defrag_target_range { static int defrag_collect_targets(struct btrfs_inode *inode, u64 start, u64 len, u32 extent_thresh, u64 newer_than, bool do_compress, - bool locked, struct list_head *target_list) + bool locked, struct list_head *target_list, + u64 *last_scanned_ret) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; + bool last_is_target = false; u64 cur = start; int ret = 0; @@ -1200,12 +1413,25 @@ static int defrag_collect_targets(struct btrfs_inode *inode, bool next_mergeable = true; u64 range_len; - em = defrag_lookup_extent(&inode->vfs_inode, cur, locked); + last_is_target = false; + em = defrag_lookup_extent(&inode->vfs_inode, cur, + newer_than, locked); if (!em) break; - /* Skip hole/inline/preallocated extents */ - if (em->block_start >= EXTENT_MAP_LAST_BYTE || + /* + * If the file extent is an inlined one, we may still want to + * defrag it (fallthrough) if it will cause a regular extent. + * This is for users who want to convert inline extents to + * regular ones through max_inline= mount option. + */ + if (em->block_start == EXTENT_MAP_INLINE && + em->len <= inode->root->fs_info->max_inline) + goto next; + + /* Skip hole/delalloc/preallocated extents */ + if (em->block_start == EXTENT_MAP_HOLE || + em->block_start == EXTENT_MAP_DELALLOC || test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) goto next; @@ -1213,6 +1439,10 @@ static int defrag_collect_targets(struct btrfs_inode *inode, if (em->generation < newer_than) goto next; + /* This em is under writeback, no need to defrag */ + if (em->generation == (u64)-1) + goto next; + /* * Our start offset might be in the middle of an existing extent * map, so take that into account. @@ -1253,8 +1483,24 @@ static int defrag_collect_targets(struct btrfs_inode *inode, if (range_len >= extent_thresh) goto next; + /* + * Skip extents already at its max capacity, this is mostly for + * compressed extents, which max cap is only 128K. + */ + if (em->len >= get_extent_max_capacity(fs_info, em)) + goto next; + + /* + * Normally there are no more extents after an inline one, thus + * @next_mergeable will normally be false and not defragged. + * So if an inline extent passed all above checks, just add it + * for defrag, and be converted to regular extents. + */ + if (em->block_start == EXTENT_MAP_INLINE) + goto add; + next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, - locked); + extent_thresh, newer_than, locked); if (!next_mergeable) { struct defrag_target_range *last; @@ -1271,6 +1517,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, } add: + last_is_target = true; range_len = min(extent_map_end(em), start + len) - cur; /* * This one is a good target, check if it can be merged into @@ -1314,10 +1561,22 @@ next: kfree(entry); } } + if (!ret && last_scanned_ret) { + /* + * If the last extent is not a target, the caller can skip to + * the end of that extent. + * Otherwise, we can only go the end of the specified range. + */ + if (!last_is_target) + *last_scanned_ret = max(cur, *last_scanned_ret); + else + *last_scanned_ret = max(start + len, *last_scanned_ret); + } return ret; } #define CLUSTER_SIZE (SZ_256K) +static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); /* * Defrag one contiguous target range. @@ -1357,7 +1616,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, return ret; clear_extent_bit(&inode->io_tree, start, start + len - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 0, 0, cached_state); + EXTENT_DEFRAG, cached_state); set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state); /* Update the page status */ @@ -1372,7 +1631,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, } static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, - u32 extent_thresh, u64 newer_than, bool do_compress) + u32 extent_thresh, u64 newer_than, bool do_compress, + u64 *last_scanned_ret) { struct extent_state *cached_state = NULL; struct defrag_target_range *entry; @@ -1406,9 +1666,9 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, wait_on_page_writeback(pages[i]); /* Lock the pages range */ - lock_extent_bits(&inode->io_tree, start_index << PAGE_SHIFT, - (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, - &cached_state); + lock_extent(&inode->io_tree, start_index << PAGE_SHIFT, + (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, + &cached_state); /* * Now we have a consistent view about the extent map, re-check * which range really needs to be defragged. @@ -1418,7 +1678,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, */ ret = defrag_collect_targets(inode, start, len, extent_thresh, newer_than, do_compress, true, - &target_list); + &target_list, last_scanned_ret); if (ret < 0) goto unlock_extent; @@ -1434,9 +1694,9 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, kfree(entry); } unlock_extent: - unlock_extent_cached(&inode->io_tree, start_index << PAGE_SHIFT, - (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, - &cached_state); + unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT, + (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, + &cached_state); free_pages: for (i = 0; i < nr_pages; i++) { if (pages[i]) { @@ -1453,7 +1713,8 @@ static int defrag_one_cluster(struct btrfs_inode *inode, u64 start, u32 len, u32 extent_thresh, u64 newer_than, bool do_compress, unsigned long *sectors_defragged, - unsigned long max_sectors) + unsigned long max_sectors, + u64 *last_scanned_ret) { const u32 sectorsize = inode->root->fs_info->sectorsize; struct defrag_target_range *entry; @@ -1461,10 +1722,9 @@ static int defrag_one_cluster(struct btrfs_inode *inode, LIST_HEAD(target_list); int ret; - BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); ret = defrag_collect_targets(inode, start, len, extent_thresh, newer_than, do_compress, false, - &target_list); + &target_list, NULL); if (ret < 0) goto out; @@ -1481,6 +1741,15 @@ static int defrag_one_cluster(struct btrfs_inode *inode, range_len = min_t(u32, range_len, (max_sectors - *sectors_defragged) * sectorsize); + /* + * If defrag_one_range() has updated last_scanned_ret, + * our range may already be invalid (e.g. hole punched). + * Skip if our range is before last_scanned_ret, as there is + * no need to defrag the range anymore. + */ + if (entry->start + range_len <= *last_scanned_ret) + continue; + if (ra) page_cache_sync_readahead(inode->vfs_inode.i_mapping, ra, NULL, entry->start >> PAGE_SHIFT, @@ -1493,7 +1762,8 @@ static int defrag_one_cluster(struct btrfs_inode *inode, * accounting. */ ret = defrag_one_range(inode, entry->start, range_len, - extent_thresh, newer_than, do_compress); + extent_thresh, newer_than, do_compress, + last_scanned_ret); if (ret < 0) break; *sectors_defragged += range_len >> @@ -1504,6 +1774,8 @@ out: list_del_init(&entry->list); kfree(entry); } + if (ret >= 0) + *last_scanned_ret = max(*last_scanned_ret, start + len); return ret; } @@ -1589,11 +1861,9 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, while (cur < last_byte) { const unsigned long prev_sectors_defragged = sectors_defragged; + u64 last_scanned = cur; u64 cluster_end; - /* The cluster size 256K should always be page aligned */ - BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); - if (btrfs_defrag_cancelled(fs_info)) { ret = -EAGAIN; break; @@ -1618,8 +1888,8 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, BTRFS_I(inode)->defrag_compress = compress_type; ret = defrag_one_cluster(BTRFS_I(inode), ra, cur, cluster_end + 1 - cur, extent_thresh, - newer_than, do_compress, - §ors_defragged, max_to_defrag); + newer_than, do_compress, §ors_defragged, + max_to_defrag, &last_scanned); if (sectors_defragged > prev_sectors_defragged) balance_dirty_pages_ratelimited(inode->i_mapping); @@ -1627,11 +1897,12 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, btrfs_inode_unlock(inode, 0); if (ret < 0) break; - cur = cluster_end + 1; + cur = max(cluster_end + 1, last_scanned); if (ret > 0) { ret = 0; break; } + cond_resched(); } if (ra_allocated) @@ -2009,10 +2280,9 @@ free_args: return ret; } -static noinline int btrfs_ioctl_subvol_getflags(struct file *file, +static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode, void __user *arg) { - struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; @@ -2317,7 +2587,12 @@ static noinline int search_ioctl(struct inode *inode, while (1) { ret = -EFAULT; - if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset)) + /* + * Ensure that the whole user buffer is faulted in at sub-page + * granularity, otherwise the loop may live-lock. + */ + if (fault_in_subpage_writeable(ubuf + sk_offset, + *buf_size - sk_offset)) break; ret = btrfs_search_forward(root, &key, path, sk->min_transid); @@ -2342,26 +2617,22 @@ err: return ret; } -static noinline int btrfs_ioctl_tree_search(struct file *file, - void __user *argp) +static noinline int btrfs_ioctl_tree_search(struct inode *inode, + void __user *argp) { - struct btrfs_ioctl_search_args __user *uargs; + struct btrfs_ioctl_search_args __user *uargs = argp; struct btrfs_ioctl_search_key sk; - struct inode *inode; int ret; size_t buf_size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - uargs = (struct btrfs_ioctl_search_args __user *)argp; - if (copy_from_user(&sk, &uargs->key, sizeof(sk))) return -EFAULT; buf_size = sizeof(uargs->buf); - inode = file_inode(file); ret = search_ioctl(inode, &sk, &buf_size, uargs->buf); /* @@ -2376,12 +2647,11 @@ static noinline int btrfs_ioctl_tree_search(struct file *file, return ret; } -static noinline int btrfs_ioctl_tree_search_v2(struct file *file, +static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, void __user *argp) { - struct btrfs_ioctl_search_args_v2 __user *uarg; + struct btrfs_ioctl_search_args_v2 __user *uarg = argp; struct btrfs_ioctl_search_args_v2 args; - struct inode *inode; int ret; size_t buf_size; const size_t buf_limit = SZ_16M; @@ -2390,7 +2660,6 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file, return -EPERM; /* copy search header and buffer size */ - uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp; if (copy_from_user(&args, uarg, sizeof(args))) return -EFAULT; @@ -2400,7 +2669,6 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file, if (buf_size > buf_limit) buf_size = buf_limit; - inode = file_inode(file); ret = search_ioctl(inode, &args.key, &buf_size, (char __user *)(&uarg->buf[0])); if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) @@ -2651,25 +2919,22 @@ out: return ret; } -static noinline int btrfs_ioctl_ino_lookup(struct file *file, +static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root, void __user *argp) { struct btrfs_ioctl_ino_lookup_args *args; - struct inode *inode; int ret = 0; args = memdup_user(argp, sizeof(*args)); if (IS_ERR(args)) return PTR_ERR(args); - inode = file_inode(file); - /* * Unprivileged query to obtain the containing subvolume root id. The * path is reset so it's consistent with btrfs_search_path_in_tree. */ if (args->treeid == 0) - args->treeid = BTRFS_I(inode)->root->root_key.objectid; + args->treeid = root->root_key.objectid; if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) { args->name[0] = 0; @@ -2681,7 +2946,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file, goto out; } - ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info, + ret = btrfs_search_path_in_tree(root->fs_info, args->treeid, args->objectid, args->name); @@ -2737,7 +3002,7 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp) } /* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */ -static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) +static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp) { struct btrfs_ioctl_get_subvol_info_args *subvol_info; struct btrfs_fs_info *fs_info; @@ -2749,7 +3014,6 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) struct extent_buffer *leaf; unsigned long item_off; unsigned long item_len; - struct inode *inode; int slot; int ret = 0; @@ -2763,7 +3027,6 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) return -ENOMEM; } - inode = file_inode(file); fs_info = BTRFS_I(inode)->root->fs_info; /* Get root_item of inode's subvolume */ @@ -2842,6 +3105,8 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) } } + btrfs_free_path(path); + path = NULL; if (copy_to_user(argp, subvol_info, sizeof(*subvol_info))) ret = -EFAULT; @@ -2857,15 +3122,14 @@ out_free: * Return ROOT_REF information of the subvolume containing this inode * except the subvolume name. */ -static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp) +static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root, + void __user *argp) { struct btrfs_ioctl_get_subvol_rootref_args *rootrefs; struct btrfs_root_ref *rref; - struct btrfs_root *root; struct btrfs_path *path; struct btrfs_key key; struct extent_buffer *leaf; - struct inode *inode; u64 objectid; int slot; int ret; @@ -2881,15 +3145,13 @@ static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp) return PTR_ERR(rootrefs); } - inode = file_inode(file); - root = BTRFS_I(inode)->root->fs_info->tree_root; - objectid = BTRFS_I(inode)->root->root_key.objectid; - + objectid = root->root_key.objectid; key.objectid = objectid; key.type = BTRFS_ROOT_REF_KEY; key.offset = rootrefs->min_treeid; found = 0; + root = root->fs_info->tree_root; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { goto out; @@ -2934,6 +3196,8 @@ static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp) } out: + btrfs_free_path(path); + if (!ret || ret == -EOVERFLOW) { rootrefs->num_items = found; /* update min_treeid for next search */ @@ -2945,7 +3209,6 @@ out: } kfree(rootrefs); - btrfs_free_path(path); return ret; } @@ -2969,6 +3232,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, int err = 0; bool destroy_parent = false; + /* We don't support snapshots with extent tree v2 yet. */ + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, + "extent tree v2 doesn't support snapshot deletion yet"); + return -EOPNOTSUPP; + } + if (destroy_v2) { vol_args2 = memdup_user(arg, sizeof(*vol_args2)); if (IS_ERR(vol_args2)) @@ -3244,6 +3514,11 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, "device add not supported on extent tree v2 yet"); + return -EINVAL; + } + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) { if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; @@ -3354,7 +3629,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) struct block_device *bdev = NULL; fmode_t mode; int ret; - bool cancel; + bool cancel = false; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3769,6 +4044,11 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, "scrub is not supported on extent tree v2 yet"); + return -EINVAL; + } + sa = memdup_user(arg, sizeof(*sa)); if (IS_ERR(sa)) return PTR_ERR(sa); @@ -3868,6 +4148,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, "device replace not supported on extent tree v2 yet"); + return -EINVAL; + } + p = memdup_user(arg, sizeof(*p)); if (IS_ERR(p)) return PTR_ERR(p); @@ -3949,6 +4234,8 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) ipath->fspath->val[i] = rel_ptr; } + btrfs_free_path(path); + path = NULL; ret = copy_to_user((void __user *)(unsigned long)ipa->fspath, ipath->fspath, size); if (ret) { @@ -3964,26 +4251,6 @@ out: return ret; } -static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) -{ - struct btrfs_data_container *inodes = ctx; - const size_t c = 3 * sizeof(u64); - - if (inodes->bytes_left >= c) { - inodes->bytes_left -= c; - inodes->val[inodes->elem_cnt] = inum; - inodes->val[inodes->elem_cnt + 1] = offset; - inodes->val[inodes->elem_cnt + 2] = root; - inodes->elem_cnt += 3; - } else { - inodes->bytes_missing += c - inodes->bytes_left; - inodes->bytes_left = 0; - inodes->elem_missed += 3; - } - - return 0; -} - static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, void __user *arg, int version) { @@ -4019,21 +4286,20 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, size = min_t(u32, loi->size, SZ_16M); } - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - inodes = init_data_container(size); if (IS_ERR(inodes)) { ret = PTR_ERR(inodes); - inodes = NULL; - goto out; + goto out_loi; } + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } ret = iterate_inodes_from_logical(loi->logical, fs_info, path, - build_ino_list, inodes, ignore_offset); + inodes, ignore_offset); + btrfs_free_path(path); if (ret == -EINVAL) ret = -ENOENT; if (ret < 0) @@ -4045,7 +4311,6 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, ret = -EFAULT; out: - btrfs_free_path(path); kvfree(inodes); out_loi: kfree(loi); @@ -4076,19 +4341,81 @@ void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, spin_unlock(&fs_info->balance_lock); } +/** + * Try to acquire fs_info::balance_mutex as well as set BTRFS_EXLCOP_BALANCE as + * required. + * + * @fs_info: the filesystem + * @excl_acquired: ptr to boolean value which is set to false in case balance + * is being resumed + * + * Return 0 on success in which case both fs_info::balance is acquired as well + * as exclusive ops are blocked. In case of failure return an error code. + */ +static int btrfs_try_lock_balance(struct btrfs_fs_info *fs_info, bool *excl_acquired) +{ + int ret; + + /* + * Exclusive operation is locked. Three possibilities: + * (1) some other op is running + * (2) balance is running + * (3) balance is paused -- special case (think resume) + */ + while (1) { + if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { + *excl_acquired = true; + mutex_lock(&fs_info->balance_mutex); + return 0; + } + + mutex_lock(&fs_info->balance_mutex); + if (fs_info->balance_ctl) { + /* This is either (2) or (3) */ + if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { + /* This is (2) */ + ret = -EINPROGRESS; + goto out_failure; + + } else { + mutex_unlock(&fs_info->balance_mutex); + /* + * Lock released to allow other waiters to + * continue, we'll reexamine the status again. + */ + mutex_lock(&fs_info->balance_mutex); + + if (fs_info->balance_ctl && + !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { + /* This is (3) */ + *excl_acquired = false; + return 0; + } + } + } else { + /* This is (1) */ + ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + goto out_failure; + } + + mutex_unlock(&fs_info->balance_mutex); + } + +out_failure: + mutex_unlock(&fs_info->balance_mutex); + *excl_acquired = false; + return ret; +} + static long btrfs_ioctl_balance(struct file *file, void __user *arg) { struct btrfs_root *root = BTRFS_I(file_inode(file))->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ioctl_balance_args *bargs; struct btrfs_balance_control *bctl; - bool need_unlock; /* for mut. excl. ops lock */ + bool need_unlock = true; int ret; - if (!arg) - btrfs_warn(fs_info, - "IOC_BALANCE ioctl (v1) is deprecated and will be removed in kernel 5.18"); - if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4096,106 +4423,55 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) if (ret) return ret; -again: - if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { - mutex_lock(&fs_info->balance_mutex); - need_unlock = true; - goto locked; + bargs = memdup_user(arg, sizeof(*bargs)); + if (IS_ERR(bargs)) { + ret = PTR_ERR(bargs); + bargs = NULL; + goto out; } - /* - * mut. excl. ops lock is locked. Three possibilities: - * (1) some other op is running - * (2) balance is running - * (3) balance is paused -- special case (think resume) - */ - mutex_lock(&fs_info->balance_mutex); - if (fs_info->balance_ctl) { - /* this is either (2) or (3) */ - if (!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { - mutex_unlock(&fs_info->balance_mutex); - /* - * Lock released to allow other waiters to continue, - * we'll reexamine the status again. - */ - mutex_lock(&fs_info->balance_mutex); - - if (fs_info->balance_ctl && - !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { - /* this is (3) */ - need_unlock = false; - goto locked; - } - - mutex_unlock(&fs_info->balance_mutex); - goto again; - } else { - /* this is (2) */ - mutex_unlock(&fs_info->balance_mutex); - ret = -EINPROGRESS; - goto out; - } - } else { - /* this is (1) */ - mutex_unlock(&fs_info->balance_mutex); - ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + ret = btrfs_try_lock_balance(fs_info, &need_unlock); + if (ret) goto out; - } -locked: + lockdep_assert_held(&fs_info->balance_mutex); - if (arg) { - bargs = memdup_user(arg, sizeof(*bargs)); - if (IS_ERR(bargs)) { - ret = PTR_ERR(bargs); + if (bargs->flags & BTRFS_BALANCE_RESUME) { + if (!fs_info->balance_ctl) { + ret = -ENOTCONN; goto out_unlock; } - if (bargs->flags & BTRFS_BALANCE_RESUME) { - if (!fs_info->balance_ctl) { - ret = -ENOTCONN; - goto out_bargs; - } + bctl = fs_info->balance_ctl; + spin_lock(&fs_info->balance_lock); + bctl->flags |= BTRFS_BALANCE_RESUME; + spin_unlock(&fs_info->balance_lock); + btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE); - bctl = fs_info->balance_ctl; - spin_lock(&fs_info->balance_lock); - bctl->flags |= BTRFS_BALANCE_RESUME; - spin_unlock(&fs_info->balance_lock); - btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE); + goto do_balance; + } - goto do_balance; - } - } else { - bargs = NULL; + if (bargs->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) { + ret = -EINVAL; + goto out_unlock; } if (fs_info->balance_ctl) { ret = -EINPROGRESS; - goto out_bargs; + goto out_unlock; } bctl = kzalloc(sizeof(*bctl), GFP_KERNEL); if (!bctl) { ret = -ENOMEM; - goto out_bargs; - } - - if (arg) { - memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); - memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); - memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); - - bctl->flags = bargs->flags; - } else { - /* balance everything - no filters */ - bctl->flags |= BTRFS_BALANCE_TYPE_MASK; + goto out_unlock; } - if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) { - ret = -EINVAL; - goto out_bctl; - } + memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); + memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); + memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); + bctl->flags = bargs->flags; do_balance: /* * Ownership of bctl and exclusive operation goes to btrfs_balance. @@ -4208,21 +4484,19 @@ do_balance: ret = btrfs_balance(fs_info, bctl, bargs); bctl = NULL; - if ((ret == 0 || ret == -ECANCELED) && arg) { + if (ret == 0 || ret == -ECANCELED) { if (copy_to_user(arg, bargs, sizeof(*bargs))) ret = -EFAULT; } -out_bctl: kfree(bctl); -out_bargs: - kfree(bargs); out_unlock: mutex_unlock(&fs_info->balance_mutex); if (need_unlock) btrfs_exclop_finish(fs_info); out: mnt_drop_write_file(file); + kfree(bargs); return ret; } @@ -4929,7 +5203,7 @@ out_drop_write: return ret; } -static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat) +static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat) { struct btrfs_ioctl_send_args *arg; int ret; @@ -4959,11 +5233,194 @@ static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat) if (IS_ERR(arg)) return PTR_ERR(arg); } - ret = btrfs_ioctl_send(file, arg); + ret = btrfs_ioctl_send(inode, arg); kfree(arg); return ret; } +static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, + bool compat) +{ + struct btrfs_ioctl_encoded_io_args args = { 0 }; + size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, + flags); + size_t copy_end; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + loff_t pos; + struct kiocb kiocb; + ssize_t ret; + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_acct; + } + + if (compat) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; + + copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, + flags); + if (copy_from_user(&args32, argp, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + args.iov = compat_ptr(args32.iov); + args.iovcnt = args32.iovcnt; + args.offset = args32.offset; + args.flags = args32.flags; +#else + return -ENOTTY; +#endif + } else { + copy_end = copy_end_kernel; + if (copy_from_user(&args, argp, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + } + if (args.flags != 0) { + ret = -EINVAL; + goto out_acct; + } + + ret = import_iovec(READ, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), + &iov, &iter); + if (ret < 0) + goto out_acct; + + if (iov_iter_count(&iter) == 0) { + ret = 0; + goto out_iov; + } + pos = args.offset; + ret = rw_verify_area(READ, file, &pos, args.len); + if (ret < 0) + goto out_iov; + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = pos; + + ret = btrfs_encoded_read(&kiocb, &iter, &args); + if (ret >= 0) { + fsnotify_access(file); + if (copy_to_user(argp + copy_end, + (char *)&args + copy_end_kernel, + sizeof(args) - copy_end_kernel)) + ret = -EFAULT; + } + +out_iov: + kfree(iov); +out_acct: + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + return ret; +} + +static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat) +{ + struct btrfs_ioctl_encoded_io_args args; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + loff_t pos; + struct kiocb kiocb; + ssize_t ret; + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_acct; + } + + if (!(file->f_mode & FMODE_WRITE)) { + ret = -EBADF; + goto out_acct; + } + + if (compat) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; + + if (copy_from_user(&args32, argp, sizeof(args32))) { + ret = -EFAULT; + goto out_acct; + } + args.iov = compat_ptr(args32.iov); + args.iovcnt = args32.iovcnt; + args.offset = args32.offset; + args.flags = args32.flags; + args.len = args32.len; + args.unencoded_len = args32.unencoded_len; + args.unencoded_offset = args32.unencoded_offset; + args.compression = args32.compression; + args.encryption = args32.encryption; + memcpy(args.reserved, args32.reserved, sizeof(args.reserved)); +#else + return -ENOTTY; +#endif + } else { + if (copy_from_user(&args, argp, sizeof(args))) { + ret = -EFAULT; + goto out_acct; + } + } + + ret = -EINVAL; + if (args.flags != 0) + goto out_acct; + if (memchr_inv(args.reserved, 0, sizeof(args.reserved))) + goto out_acct; + if (args.compression == BTRFS_ENCODED_IO_COMPRESSION_NONE && + args.encryption == BTRFS_ENCODED_IO_ENCRYPTION_NONE) + goto out_acct; + if (args.compression >= BTRFS_ENCODED_IO_COMPRESSION_TYPES || + args.encryption >= BTRFS_ENCODED_IO_ENCRYPTION_TYPES) + goto out_acct; + if (args.unencoded_offset > args.unencoded_len) + goto out_acct; + if (args.len > args.unencoded_len - args.unencoded_offset) + goto out_acct; + + ret = import_iovec(WRITE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), + &iov, &iter); + if (ret < 0) + goto out_acct; + + file_start_write(file); + + if (iov_iter_count(&iter) == 0) { + ret = 0; + goto out_end_write; + } + pos = args.offset; + ret = rw_verify_area(WRITE, file, &pos, args.len); + if (ret < 0) + goto out_end_write; + + init_sync_kiocb(&kiocb, file); + ret = kiocb_set_rw_flags(&kiocb, 0); + if (ret) + goto out_end_write; + kiocb.ki_pos = pos; + + ret = btrfs_do_write_iter(&kiocb, &iter, &args); + if (ret > 0) + fsnotify_modify(file); + +out_end_write: + file_end_write(file); + kfree(iov); +out_acct: + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -4974,7 +5431,7 @@ long btrfs_ioctl(struct file *file, unsigned int switch (cmd) { case FS_IOC_GETVERSION: - return btrfs_ioctl_getversion(file, argp); + return btrfs_ioctl_getversion(inode, argp); case FS_IOC_GETFSLABEL: return btrfs_ioctl_get_fslabel(fs_info, argp); case FS_IOC_SETFSLABEL: @@ -4994,7 +5451,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SNAP_DESTROY_V2: return btrfs_ioctl_snap_destroy(file, argp, true); case BTRFS_IOC_SUBVOL_GETFLAGS: - return btrfs_ioctl_subvol_getflags(file, argp); + return btrfs_ioctl_subvol_getflags(inode, argp); case BTRFS_IOC_SUBVOL_SETFLAGS: return btrfs_ioctl_subvol_setflags(file, argp); case BTRFS_IOC_DEFAULT_SUBVOL: @@ -5015,14 +5472,12 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_fs_info(fs_info, argp); case BTRFS_IOC_DEV_INFO: return btrfs_ioctl_dev_info(fs_info, argp); - case BTRFS_IOC_BALANCE: - return btrfs_ioctl_balance(file, NULL); case BTRFS_IOC_TREE_SEARCH: - return btrfs_ioctl_tree_search(file, argp); + return btrfs_ioctl_tree_search(inode, argp); case BTRFS_IOC_TREE_SEARCH_V2: - return btrfs_ioctl_tree_search_v2(file, argp); + return btrfs_ioctl_tree_search_v2(inode, argp); case BTRFS_IOC_INO_LOOKUP: - return btrfs_ioctl_ino_lookup(file, argp); + return btrfs_ioctl_ino_lookup(root, argp); case BTRFS_IOC_INO_PATHS: return btrfs_ioctl_ino_to_path(root, argp); case BTRFS_IOC_LOGICAL_INO: @@ -5069,10 +5524,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_set_received_subvol_32(file, argp); #endif case BTRFS_IOC_SEND: - return _btrfs_ioctl_send(file, argp, false); + return _btrfs_ioctl_send(inode, argp, false); #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) case BTRFS_IOC_SEND_32: - return _btrfs_ioctl_send(file, argp, true); + return _btrfs_ioctl_send(inode, argp, true); #endif case BTRFS_IOC_GET_DEV_STATS: return btrfs_ioctl_get_dev_stats(fs_info, argp); @@ -5099,15 +5554,25 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SET_FEATURES: return btrfs_ioctl_set_features(file, argp); case BTRFS_IOC_GET_SUBVOL_INFO: - return btrfs_ioctl_get_subvol_info(file, argp); + return btrfs_ioctl_get_subvol_info(inode, argp); case BTRFS_IOC_GET_SUBVOL_ROOTREF: - return btrfs_ioctl_get_subvol_rootref(file, argp); + return btrfs_ioctl_get_subvol_rootref(root, argp); case BTRFS_IOC_INO_LOOKUP_USER: return btrfs_ioctl_ino_lookup_user(file, argp); case FS_IOC_ENABLE_VERITY: return fsverity_ioctl_enable(file, (const void __user *)argp); case FS_IOC_MEASURE_VERITY: return fsverity_ioctl_measure(file, argp); + case BTRFS_IOC_ENCODED_READ: + return btrfs_ioctl_encoded_read(file, argp, false); + case BTRFS_IOC_ENCODED_WRITE: + return btrfs_ioctl_encoded_write(file, argp, false); +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + case BTRFS_IOC_ENCODED_READ_32: + return btrfs_ioctl_encoded_read(file, argp, true); + case BTRFS_IOC_ENCODED_WRITE_32: + return btrfs_ioctl_encoded_write(file, argp, true); +#endif } return -ENOTTY; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 313d9d685adb..0eab3cb274a1 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -14,6 +14,93 @@ #include "locking.h" /* + * Lockdep class keys for extent_buffer->lock's in this root. For a given + * eb, the lockdep key is determined by the btrfs_root it belongs to and + * the level the eb occupies in the tree. + * + * Different roots are used for different purposes and may nest inside each + * other and they require separate keysets. As lockdep keys should be + * static, assign keysets according to the purpose of the root as indicated + * by btrfs_root->root_key.objectid. This ensures that all special purpose + * roots have separate keysets. + * + * Lock-nesting across peer nodes is always done with the immediate parent + * node locked thus preventing deadlock. As lockdep doesn't know this, use + * subclass to avoid triggering lockdep warning in such cases. + * + * The key is set by the readpage_end_io_hook after the buffer has passed + * csum validation but before the pages are unlocked. It is also set by + * btrfs_init_new_buffer on freshly allocated blocks. + * + * We also add a check to make sure the highest level of the tree is the + * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code + * needs update as well. + */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC +#if BTRFS_MAX_LEVEL != 8 +#error +#endif + +#define DEFINE_LEVEL(stem, level) \ + .names[level] = "btrfs-" stem "-0" #level, + +#define DEFINE_NAME(stem) \ + DEFINE_LEVEL(stem, 0) \ + DEFINE_LEVEL(stem, 1) \ + DEFINE_LEVEL(stem, 2) \ + DEFINE_LEVEL(stem, 3) \ + DEFINE_LEVEL(stem, 4) \ + DEFINE_LEVEL(stem, 5) \ + DEFINE_LEVEL(stem, 6) \ + DEFINE_LEVEL(stem, 7) + +static struct btrfs_lockdep_keyset { + u64 id; /* root objectid */ + /* Longest entry: btrfs-free-space-00 */ + char names[BTRFS_MAX_LEVEL][20]; + struct lock_class_key keys[BTRFS_MAX_LEVEL]; +} btrfs_lockdep_keysets[] = { + { .id = BTRFS_ROOT_TREE_OBJECTID, DEFINE_NAME("root") }, + { .id = BTRFS_EXTENT_TREE_OBJECTID, DEFINE_NAME("extent") }, + { .id = BTRFS_CHUNK_TREE_OBJECTID, DEFINE_NAME("chunk") }, + { .id = BTRFS_DEV_TREE_OBJECTID, DEFINE_NAME("dev") }, + { .id = BTRFS_CSUM_TREE_OBJECTID, DEFINE_NAME("csum") }, + { .id = BTRFS_QUOTA_TREE_OBJECTID, DEFINE_NAME("quota") }, + { .id = BTRFS_TREE_LOG_OBJECTID, DEFINE_NAME("log") }, + { .id = BTRFS_TREE_RELOC_OBJECTID, DEFINE_NAME("treloc") }, + { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, DEFINE_NAME("dreloc") }, + { .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") }, + { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") }, + { .id = 0, DEFINE_NAME("tree") }, +}; + +#undef DEFINE_LEVEL +#undef DEFINE_NAME + +void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level) +{ + struct btrfs_lockdep_keyset *ks; + + BUG_ON(level >= ARRAY_SIZE(ks->keys)); + + /* Find the matching keyset, id 0 is the default entry */ + for (ks = btrfs_lockdep_keysets; ks->id; ks++) + if (ks->id == objectid) + break; + + lockdep_set_class_and_name(&eb->lock, &ks->keys[level], ks->names[level]); +} + +void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb) +{ + if (test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state)) + btrfs_set_buffer_lockdep_class(root->root_key.objectid, + eb, btrfs_header_level(eb)); +} + +#endif + +/* * Extent buffer locking * ===================== * @@ -45,7 +132,6 @@ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting ne start_ns = ktime_get_ns(); down_read_nested(&eb->lock, nest); - eb->lock_owner = current->pid; trace_btrfs_tree_read_lock(eb, start_ns); } @@ -62,7 +148,6 @@ void btrfs_tree_read_lock(struct extent_buffer *eb) int btrfs_try_tree_read_lock(struct extent_buffer *eb) { if (down_read_trylock(&eb->lock)) { - eb->lock_owner = current->pid; trace_btrfs_try_tree_read_lock(eb); return 1; } @@ -90,7 +175,6 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) void btrfs_tree_read_unlock(struct extent_buffer *eb) { trace_btrfs_tree_read_unlock(eb); - eb->lock_owner = 0; up_read(&eb->lock); } @@ -167,6 +251,8 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) while (1) { eb = btrfs_root_node(root); + + btrfs_maybe_reset_lockdep_class(root, eb); btrfs_tree_lock(eb); if (eb == root->node) break; @@ -188,6 +274,8 @@ struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) while (1) { eb = btrfs_root_node(root); + + btrfs_maybe_reset_lockdep_class(root, eb); btrfs_tree_read_lock(eb); if (eb == root->node) break; @@ -198,6 +286,31 @@ struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) } /* + * Loop around taking references on and locking the root node of the tree in + * nowait mode until we end up with a lock on the root node or returning to + * avoid blocking. + * + * Return: root extent buffer with read lock held or -EAGAIN. + */ +struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + eb = btrfs_root_node(root); + if (!btrfs_try_tree_read_lock(eb)) { + free_extent_buffer(eb); + return ERR_PTR(-EAGAIN); + } + if (eb == root->node) + break; + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + } + return eb; +} + +/* * DREW locks * ========== * diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index bbc45534ae9a..490c7a79e995 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -94,6 +94,7 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb); int btrfs_try_tree_write_lock(struct extent_buffer *eb); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); +struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root); #ifdef CONFIG_BTRFS_DEBUG static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) @@ -131,4 +132,18 @@ void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock); void btrfs_drew_read_lock(struct btrfs_drew_lock *lock); void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock); +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level); +void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb); +#else +static inline void btrfs_set_buffer_lockdep_class(u64 objectid, + struct extent_buffer *eb, int level) +{ +} +static inline void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, + struct extent_buffer *eb) +{ +} +#endif + #endif diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 0fb90cbe7669..89bc5f825e0a 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -55,6 +55,9 @@ * 0x1000 | SegHdr N+1| Data payload N+1 ... | */ +#define WORKSPACE_BUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE)) +#define WORKSPACE_CBUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE)) + struct workspace { void *mem; void *buf; /* where decompressed data goes */ @@ -83,8 +86,8 @@ struct list_head *lzo_alloc_workspace(unsigned int level) return ERR_PTR(-ENOMEM); workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); - workspace->buf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL); - workspace->cbuf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL); + workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL); + workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL); if (!workspace->mem || !workspace->buf || !workspace->cbuf) goto fail; @@ -152,7 +155,7 @@ static int copy_compressed_data_to_page(char *compressed_data, out_pages[*cur_out / PAGE_SIZE] = cur_page; } - kaddr = kmap(cur_page); + kaddr = kmap_local_page(cur_page); write_compress_length(kaddr + offset_in_page(*cur_out), compressed_size); *cur_out += LZO_LEN; @@ -164,7 +167,7 @@ static int copy_compressed_data_to_page(char *compressed_data, u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize, orig_out + compressed_size - *cur_out); - kunmap(cur_page); + kunmap_local(kaddr); if ((*cur_out / PAGE_SIZE) >= max_nr_page) return -E2BIG; @@ -177,7 +180,7 @@ static int copy_compressed_data_to_page(char *compressed_data, return -ENOMEM; out_pages[*cur_out / PAGE_SIZE] = cur_page; } - kaddr = kmap(cur_page); + kaddr = kmap_local_page(cur_page); memcpy(kaddr + offset_in_page(*cur_out), compressed_data + *cur_out - orig_out, copy_len); @@ -199,7 +202,7 @@ static int copy_compressed_data_to_page(char *compressed_data, *cur_out += sector_bytes_left; out: - kunmap(cur_page); + kunmap_local(kaddr); return 0; } @@ -245,12 +248,12 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, /* Compress at most one sector of data each time */ in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off); ASSERT(in_len); - data_in = kmap(page_in); + data_in = kmap_local_page(page_in); ret = lzo1x_1_compress(data_in + offset_in_page(cur_in), in_len, workspace->cbuf, &out_len, workspace->mem); - kunmap(page_in); + kunmap_local(data_in); if (ret < 0) { pr_debug("BTRFS: lzo in loop returned %d\n", ret); ret = -EIO; @@ -307,7 +310,6 @@ static void copy_compressed_segment(struct compressed_bio *cb, u32 orig_in = *cur_in; while (*cur_in < orig_in + len) { - char *kaddr; struct page *cur_page; u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in), orig_in + len - *cur_in); @@ -315,11 +317,8 @@ static void copy_compressed_segment(struct compressed_bio *cb, ASSERT(copy_len); cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE]; - kaddr = kmap(cur_page); - memcpy(dest + *cur_in - orig_in, - kaddr + offset_in_page(*cur_in), - copy_len); - kunmap(cur_page); + memcpy_from_page(dest + *cur_in - orig_in, cur_page, + offset_in_page(*cur_in), copy_len); *cur_in += copy_len; } @@ -339,9 +338,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) /* Bytes decompressed so far */ u32 cur_out = 0; - kaddr = kmap(cb->compressed_pages[0]); + kaddr = kmap_local_page(cb->compressed_pages[0]); len_in = read_compress_length(kaddr); - kunmap(cb->compressed_pages[0]); + kunmap_local(kaddr); cur_in += LZO_LEN; /* @@ -375,11 +374,22 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) (cur_in + LZO_LEN - 1) / sectorsize); cur_page = cb->compressed_pages[cur_in / PAGE_SIZE]; ASSERT(cur_page); - kaddr = kmap(cur_page); + kaddr = kmap_local_page(cur_page); seg_len = read_compress_length(kaddr + offset_in_page(cur_in)); - kunmap(cur_page); + kunmap_local(kaddr); cur_in += LZO_LEN; + if (seg_len > WORKSPACE_CBUF_LENGTH) { + /* + * seg_len shouldn't be larger than we have allocated + * for workspace->cbuf + */ + btrfs_err(fs_info, "unexpectedly large lzo segment len %u", + seg_len); + ret = -EIO; + goto out; + } + /* Copy the compressed segment payload into workspace */ copy_compressed_segment(cb, workspace->cbuf, seg_len, &cur_in); @@ -422,7 +432,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in, struct workspace *workspace = list_entry(ws, struct workspace, list); size_t in_len; size_t out_len; - size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE); + size_t max_segment_len = WORKSPACE_BUF_LENGTH; int ret = 0; char *kaddr; unsigned long bytes; diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 340f995652f2..f9850edfd726 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -88,6 +88,41 @@ static inline struct rb_node *rb_simple_search(struct rb_root *root, u64 bytenr) return NULL; } +/* + * Search @root from an entry that starts or comes after @bytenr. + * + * @root: the root to search. + * @bytenr: bytenr to search from. + * + * Return the rb_node that start at or after @bytenr. If there is no entry at + * or after @bytner return NULL. + */ +static inline struct rb_node *rb_simple_search_first(struct rb_root *root, + u64 bytenr) +{ + struct rb_node *node = root->rb_node, *ret = NULL; + struct rb_simple_node *entry, *ret_entry = NULL; + + while (node) { + entry = rb_entry(node, struct rb_simple_node, rb_node); + + if (bytenr < entry->bytenr) { + if (!ret || entry->bytenr < ret_entry->bytenr) { + ret = node; + ret_entry = entry; + } + + node = node->rb_left; + } else if (bytenr > entry->bytenr) { + node = node->rb_right; + } else { + return node; + } + } + + return ret; +} + static inline struct rb_node *rb_simple_insert(struct rb_root *root, u64 bytenr, struct rb_node *node) { diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 6b51fd2ec5ac..e54f8280031f 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -143,16 +143,28 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, return ret; } -/* - * Allocate and add a new ordered_extent into the per-inode tree. +/** + * Add an ordered extent to the per-inode tree. + * + * @inode: Inode that this extent is for. + * @file_offset: Logical offset in file where the extent starts. + * @num_bytes: Logical length of extent in file. + * @ram_bytes: Full length of unencoded data. + * @disk_bytenr: Offset of extent on disk. + * @disk_num_bytes: Size of extent on disk. + * @offset: Offset into unencoded data where file data starts. + * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*). + * @compress_type: Compression algorithm used for data. * - * The tree is given a single reference on the ordered extent that was - * inserted. + * Most of these parameters correspond to &struct btrfs_file_extent_item. The + * tree is given a single reference on the ordered extent that was inserted. + * + * Return: 0 or -ENOMEM. */ -static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int type, int dio, - int compress_type) +int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, + u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, + u64 disk_num_bytes, u64 offset, unsigned flags, + int compress_type) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -161,7 +173,8 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset struct btrfs_ordered_extent *entry; int ret; - if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_PREALLOC) { + if (flags & + ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) { /* For nocow write, we can release the qgroup rsv right now */ ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); if (ret < 0) @@ -181,9 +194,11 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset return -ENOMEM; entry->file_offset = file_offset; - entry->disk_bytenr = disk_bytenr; entry->num_bytes = num_bytes; + entry->ram_bytes = ram_bytes; + entry->disk_bytenr = disk_bytenr; entry->disk_num_bytes = disk_num_bytes; + entry->offset = offset; entry->bytes_left = num_bytes; entry->inode = igrab(&inode->vfs_inode); entry->compress_type = compress_type; @@ -191,18 +206,12 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset entry->qgroup_rsv = ret; entry->physical = (u64)-1; - ASSERT(type == BTRFS_ORDERED_REGULAR || - type == BTRFS_ORDERED_NOCOW || - type == BTRFS_ORDERED_PREALLOC || - type == BTRFS_ORDERED_COMPRESSED); - set_bit(type, &entry->flags); + ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0); + entry->flags = flags; percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes, fs_info->delalloc_batch); - if (dio) - set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); - /* one ref for the tree */ refcount_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); @@ -247,41 +256,6 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset return 0; } -int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, - int type) -{ - ASSERT(type == BTRFS_ORDERED_REGULAR || - type == BTRFS_ORDERED_NOCOW || - type == BTRFS_ORDERED_PREALLOC); - return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, - num_bytes, disk_num_bytes, type, 0, - BTRFS_COMPRESS_NONE); -} - -int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int type) -{ - ASSERT(type == BTRFS_ORDERED_REGULAR || - type == BTRFS_ORDERED_NOCOW || - type == BTRFS_ORDERED_PREALLOC); - return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, - num_bytes, disk_num_bytes, type, 1, - BTRFS_COMPRESS_NONE); -} - -int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int compress_type) -{ - ASSERT(compress_type != BTRFS_COMPRESS_NONE); - return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, - num_bytes, disk_num_bytes, - BTRFS_ORDERED_COMPRESSED, 0, - compress_type); -} - /* * Add a struct btrfs_ordered_sum into the list of checksums to be inserted * when an ordered extent is finished. If the list covers more than one @@ -298,25 +272,30 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, spin_unlock_irq(&tree->lock); } +static void finish_ordered_fn(struct btrfs_work *work) +{ + struct btrfs_ordered_extent *ordered_extent; + + ordered_extent = container_of(work, struct btrfs_ordered_extent, work); + btrfs_finish_ordered_io(ordered_extent); +} + /* * Mark all ordered extents io inside the specified range finished. * - * @page: The invovled page for the opeartion. + * @page: The involved page for the operation. * For uncompressed buffered IO, the page status also needs to be * updated to indicate whether the pending ordered io is finished. * Can be NULL for direct IO and compressed write. * For these cases, callers are ensured they won't execute the * endio function twice. - * @finish_func: The function to be executed when all the IO of an ordered - * extent are finished. * * This function is called for endio, thus the range must have ordered - * extent(s) coveri it. + * extent(s) covering it. */ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, - struct page *page, u64 file_offset, - u64 num_bytes, btrfs_func_t finish_func, - bool uptodate) + struct page *page, u64 file_offset, + u64 num_bytes, bool uptodate) { struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -427,8 +406,9 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); cond_wake_up(&entry->wait); refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_mark_finished(inode, entry); spin_unlock_irqrestore(&tree->lock, flags); - btrfs_init_work(&entry->work, finish_func, NULL, NULL); + btrfs_init_work(&entry->work, finish_ordered_fn, NULL, NULL); btrfs_queue_work(wq, &entry->work); spin_lock_irqsave(&tree->lock, flags); } @@ -499,6 +479,7 @@ out: if (finished && cached && entry) { *cached = entry; refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_dec_test_pending(inode, entry); } spin_unlock_irqrestore(&tree->lock, flags); return finished; @@ -543,14 +524,28 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *node; bool pending; + bool freespace_inode; + /* + * If this is a free space inode the thread has not acquired the ordered + * extents lockdep map. + */ + freespace_inode = btrfs_is_free_space_inode(btrfs_inode); + + btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered); /* This is paired with btrfs_add_ordered_extent. */ spin_lock(&btrfs_inode->lock); btrfs_mod_outstanding_extents(btrfs_inode, -1); spin_unlock(&btrfs_inode->lock); - if (root != fs_info->tree_root) - btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes, - false); + if (root != fs_info->tree_root) { + u64 release; + + if (test_bit(BTRFS_ORDERED_ENCODED, &entry->flags)) + release = entry->disk_num_bytes; + else + release = entry->num_bytes; + btrfs_delalloc_release_metadata(btrfs_inode, release, false); + } percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes, fs_info->delalloc_batch); @@ -593,6 +588,8 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, } } + btrfs_lockdep_release(fs_info, btrfs_trans_pending_ordered); + spin_lock(&root->ordered_extent_lock); list_del_init(&entry->root_extent_list); root->nr_ordered_extents--; @@ -607,6 +604,8 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, } spin_unlock(&root->ordered_extent_lock); wake_up(&entry->wait); + if (!freespace_inode) + btrfs_lockdep_release(fs_info, btrfs_ordered_extent); } static void btrfs_run_ordered_extent_work(struct btrfs_work *work) @@ -725,10 +724,17 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; struct btrfs_inode *inode = BTRFS_I(entry->inode); + bool freespace_inode; trace_btrfs_ordered_extent_start(inode, entry); /* + * If this is a free space inode do not take the ordered extents lockdep + * map. + */ + freespace_inode = btrfs_is_free_space_inode(inode); + + /* * pages in the range can be dirty, clean or writeback. We * start IO on any dirty ones so the wait doesn't stall waiting * for the flusher thread to find them @@ -736,6 +742,8 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); if (wait) { + if (!freespace_inode) + btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); } @@ -827,8 +835,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); if (!in_range(file_offset, entry->file_offset, entry->num_bytes)) entry = NULL; - if (entry) + if (entry) { refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_lookup(inode, entry); + } out: spin_unlock_irqrestore(&tree->lock, flags); return entry; @@ -868,8 +878,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( break; } out: - if (entry) + if (entry) { refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_lookup_range(inode, entry); + } spin_unlock_irq(&tree->lock); return entry; } @@ -898,6 +910,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, ASSERT(list_empty(&ordered->log_list)); list_add_tail(&ordered->log_list, list); refcount_inc(&ordered->refs); + trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered); } spin_unlock_irq(&tree->lock); } @@ -921,6 +934,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_lookup_first(inode, entry); out: spin_unlock_irq(&tree->lock); return entry; @@ -995,8 +1009,11 @@ struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range( /* No ordered extent in the range */ entry = NULL; out: - if (entry) + if (entry) { refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_lookup_first_range(inode, entry); + } + spin_unlock_irq(&tree->lock); return entry; } @@ -1026,7 +1043,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, cachedp = cached_state; while (1) { - lock_extent_bits(&inode->io_tree, start, end, cachedp); + lock_extent(&inode->io_tree, start, end, cachedp); ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1); if (!ordered) { @@ -1039,12 +1056,37 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, refcount_dec(&cache->refs); break; } - unlock_extent_cached(&inode->io_tree, start, end, cachedp); + unlock_extent(&inode->io_tree, start, end, cachedp); btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); } } +/* + * Lock the passed range and ensure all pending ordered extents in it are run + * to completion in nowait mode. + * + * Return true if btrfs_lock_ordered_range does not return any extents, + * otherwise false. + */ +bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end) +{ + struct btrfs_ordered_extent *ordered; + + if (!try_lock_extent(&inode->io_tree, start, end)) + return false; + + ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1); + if (!ordered) + return true; + + btrfs_put_ordered_extent(ordered); + unlock_extent(&inode->io_tree, start, end, NULL); + + return false; +} + + static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos, u64 len) { @@ -1052,42 +1094,18 @@ static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos, struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; u64 file_offset = ordered->file_offset + pos; u64 disk_bytenr = ordered->disk_bytenr + pos; - u64 num_bytes = len; - u64 disk_num_bytes = len; - int type; - unsigned long flags_masked = ordered->flags & ~(1 << BTRFS_ORDERED_DIRECT); - int compress_type = ordered->compress_type; - unsigned long weight; - int ret; - - weight = hweight_long(flags_masked); - WARN_ON_ONCE(weight > 1); - if (!weight) - type = 0; - else - type = __ffs(flags_masked); + unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS; /* - * The splitting extent is already counted and will be added again - * in btrfs_add_ordered_extent_*(). Subtract num_bytes to avoid - * double counting. + * The splitting extent is already counted and will be added again in + * btrfs_add_ordered_extent_*(). Subtract len to avoid double counting. */ - percpu_counter_add_batch(&fs_info->ordered_bytes, -num_bytes, + percpu_counter_add_batch(&fs_info->ordered_bytes, -len, fs_info->delalloc_batch); - if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered->flags)) { - WARN_ON_ONCE(1); - ret = btrfs_add_ordered_extent_compress(BTRFS_I(inode), - file_offset, disk_bytenr, num_bytes, - disk_num_bytes, compress_type); - } else if (test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { - ret = btrfs_add_ordered_extent_dio(BTRFS_I(inode), file_offset, - disk_bytenr, num_bytes, disk_num_bytes, type); - } else { - ret = btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, - disk_bytenr, num_bytes, disk_num_bytes, type); - } - - return ret; + WARN_ON_ONCE(flags & (1 << BTRFS_ORDERED_COMPRESSED)); + return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len, + disk_bytenr, len, 0, flags, + ordered->compress_type); } int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, @@ -1099,6 +1117,8 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int ret = 0; + trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered); + spin_lock_irq(&tree->lock); /* Remove from tree once */ node = &ordered->rb_node; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 4194e960ff61..f59f2dbdb25e 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -74,8 +74,18 @@ enum { BTRFS_ORDERED_LOGGED_CSUM, /* We wait for this extent to complete in the current transaction */ BTRFS_ORDERED_PENDING, + /* BTRFS_IOC_ENCODED_WRITE */ + BTRFS_ORDERED_ENCODED, }; +/* BTRFS_ORDERED_* flags that specify the type of the extent. */ +#define BTRFS_ORDERED_TYPE_FLAGS ((1UL << BTRFS_ORDERED_REGULAR) | \ + (1UL << BTRFS_ORDERED_NOCOW) | \ + (1UL << BTRFS_ORDERED_PREALLOC) | \ + (1UL << BTRFS_ORDERED_COMPRESSED) | \ + (1UL << BTRFS_ORDERED_DIRECT) | \ + (1UL << BTRFS_ORDERED_ENCODED)) + struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; @@ -84,9 +94,11 @@ struct btrfs_ordered_extent { * These fields directly correspond to the same fields in * btrfs_file_extent_item. */ - u64 disk_bytenr; u64 num_bytes; + u64 ram_bytes; + u64 disk_bytenr; u64 disk_num_bytes; + u64 offset; /* number of bytes that still need writing */ u64 bytes_left; @@ -148,18 +160,6 @@ struct btrfs_ordered_extent { struct block_device *bdev; }; -/* - * calculates the total size you need to allocate for an ordered sum - * structure spanning 'bytes' in the file - */ -static inline int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, - unsigned long bytes) -{ - int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize); - - return sizeof(struct btrfs_ordered_sum) + num_sectors * fs_info->csum_size; -} - static inline void btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) { @@ -168,25 +168,21 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) t->last = NULL; } +int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); + void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry); void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, struct page *page, u64 file_offset, - u64 num_bytes, btrfs_func_t finish_func, - bool uptodate); + u64 num_bytes, bool uptodate); bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size); int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, - int type); -int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int type); -int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset, - u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int compress_type); + u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, + u64 disk_num_bytes, u64 offset, unsigned flags, + int compress_type); void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, @@ -210,6 +206,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); +bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end); int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, u64 post); int __init ordered_data_init(void); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 0775ae9f4419..dd8777872143 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -23,6 +23,7 @@ static const struct root_name_map root_map[] = { { BTRFS_QUOTA_TREE_OBJECTID, "QUOTA_TREE" }, { BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" }, { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" }, + { BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" }, { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" }, }; @@ -391,9 +392,9 @@ void btrfs_print_tree(struct extent_buffer *c, bool follow) btrfs_header_owner(c), btrfs_node_ptr_generation(c, i), level - 1, &first_key); - if (IS_ERR(next)) { + if (IS_ERR(next)) continue; - } else if (!extent_buffer_uptodate(next)) { + if (!extent_buffer_uptodate(next)) { free_extent_buffer(next); continue; } diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 1a6d2d5b4b33..055a631276ce 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -17,9 +17,11 @@ static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); struct prop_handler { struct hlist_node node; const char *xattr_name; - int (*validate)(const char *value, size_t len); + int (*validate)(const struct btrfs_inode *inode, const char *value, + size_t len); int (*apply)(struct inode *inode, const char *value, size_t len); const char *(*extract)(struct inode *inode); + bool (*ignore)(const struct btrfs_inode *inode); int inheritable; }; @@ -55,7 +57,8 @@ find_prop_handler(const char *name, return NULL; } -int btrfs_validate_prop(const char *name, const char *value, size_t value_len) +int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name, + const char *value, size_t value_len) { const struct prop_handler *handler; @@ -69,7 +72,29 @@ int btrfs_validate_prop(const char *name, const char *value, size_t value_len) if (value_len == 0) return 0; - return handler->validate(value, value_len); + return handler->validate(inode, value, value_len); +} + +/* + * Check if a property should be ignored (not set) for an inode. + * + * @inode: The target inode. + * @name: The property's name. + * + * The caller must be sure the given property name is valid, for example by + * having previously called btrfs_validate_prop(). + * + * Returns: true if the property should be ignored for the given inode + * false if the property must not be ignored for the given inode + */ +bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name) +{ + const struct prop_handler *handler; + + handler = find_prop_handler(name, NULL); + ASSERT(handler != NULL); + + return handler->ignore(inode); } int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, @@ -245,15 +270,16 @@ int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 ino = btrfs_ino(BTRFS_I(inode)); - int ret; - - ret = iterate_object_props(root, path, ino, inode_prop_iterator, inode); - return ret; + return iterate_object_props(root, path, ino, inode_prop_iterator, inode); } -static int prop_compression_validate(const char *value, size_t len) +static int prop_compression_validate(const struct btrfs_inode *inode, + const char *value, size_t len) { + if (!btrfs_inode_can_compress(inode)) + return -EINVAL; + if (!value) return 0; @@ -310,6 +336,22 @@ static int prop_compression_apply(struct inode *inode, const char *value, return 0; } +static bool prop_compression_ignore(const struct btrfs_inode *inode) +{ + /* + * Compression only has effect for regular files, and for directories + * we set it just to propagate it to new files created inside them. + * Everything else (symlinks, devices, sockets, fifos) is pointless as + * it will do nothing, so don't waste metadata space on a compression + * xattr for anything that is neither a file nor a directory. + */ + if (!S_ISREG(inode->vfs_inode.i_mode) && + !S_ISDIR(inode->vfs_inode.i_mode)) + return true; + + return false; +} + static const char *prop_compression_extract(struct inode *inode) { switch (BTRFS_I(inode)->prop_compress) { @@ -330,13 +372,13 @@ static struct prop_handler prop_handlers[] = { .validate = prop_compression_validate, .apply = prop_compression_apply, .extract = prop_compression_extract, + .ignore = prop_compression_ignore, .inheritable = 1 }, }; -static int inherit_props(struct btrfs_trans_handle *trans, - struct inode *inode, - struct inode *parent) +int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *parent) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -356,6 +398,9 @@ static int inherit_props(struct btrfs_trans_handle *trans, if (!h->inheritable) continue; + if (h->ignore(BTRFS_I(inode))) + continue; + value = h->extract(parent); if (!value) continue; @@ -364,7 +409,7 @@ static int inherit_props(struct btrfs_trans_handle *trans, * This is not strictly necessary as the property should be * valid, but in case it isn't, don't propagate it further. */ - ret = h->validate(value, strlen(value)); + ret = h->validate(BTRFS_I(inode), value, strlen(value)); if (ret) continue; @@ -408,41 +453,6 @@ static int inherit_props(struct btrfs_trans_handle *trans, return 0; } -int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, - struct inode *inode, - struct inode *dir) -{ - if (!dir) - return 0; - - return inherit_props(trans, inode, dir); -} - -int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *parent_root) -{ - struct super_block *sb = root->fs_info->sb; - struct inode *parent_inode, *child_inode; - int ret; - - parent_inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, parent_root); - if (IS_ERR(parent_inode)) - return PTR_ERR(parent_inode); - - child_inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, root); - if (IS_ERR(child_inode)) { - iput(parent_inode); - return PTR_ERR(child_inode); - } - - ret = inherit_props(trans, child_inode, parent_inode); - iput(child_inode); - iput(parent_inode); - - return ret; -} - void __init btrfs_props_init(void) { int i; diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index 40b2c65b518c..ca9dd3df129b 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -13,7 +13,9 @@ void __init btrfs_props_init(void); int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const char *value, size_t value_len, int flags); -int btrfs_validate_prop(const char *name, const char *value, size_t value_len); +int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name, + const char *value, size_t value_len); +bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name); int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path); @@ -21,8 +23,4 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir); -int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *parent_root); - #endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 8928275823a1..b74105a10f16 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -25,18 +25,6 @@ #include "sysfs.h" #include "tree-mod-log.h" -/* TODO XXX FIXME - * - subvol delete -> delete when ref goes to 0? delete limits also? - * - reorganize keys - * - compressed - * - sync - * - copy also limits on subvol creation - * - limit - * - caches for ulists - * - performance benchmarks - * - check all ioctl parameters - */ - /* * Helpers to access qgroup reservation * @@ -258,16 +246,19 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) return 0; } -/* must be called with qgroup_lock held */ -static int add_relation_rb(struct btrfs_fs_info *fs_info, - u64 memberid, u64 parentid) +/* + * Add relation specified by two qgroups. + * + * Must be called with qgroup_lock held. + * + * Return: 0 on success + * -ENOENT if one of the qgroups is NULL + * <0 other errors + */ +static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent) { - struct btrfs_qgroup *member; - struct btrfs_qgroup *parent; struct btrfs_qgroup_list *list; - member = find_qgroup_rb(fs_info, memberid); - parent = find_qgroup_rb(fs_info, parentid); if (!member || !parent) return -ENOENT; @@ -283,7 +274,27 @@ static int add_relation_rb(struct btrfs_fs_info *fs_info, return 0; } -/* must be called with qgroup_lock held */ +/* + * Add relation specified by two qgroup ids. + * + * Must be called with qgroup_lock held. + * + * Return: 0 on success + * -ENOENT if one of the ids does not exist + * <0 other errors + */ +static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid) +{ + struct btrfs_qgroup *member; + struct btrfs_qgroup *parent; + + member = find_qgroup_rb(fs_info, memberid); + parent = find_qgroup_rb(fs_info, parentid); + + return __add_relation_rb(member, parent); +} + +/* Must be called with qgroup_lock held */ static int del_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid) { @@ -322,6 +333,13 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, } #endif +static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info) +{ + fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT | + BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | + BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); +} + /* * The full config is read in one go, only called from open_ctree() * It doesn't use any locking, as at this point we're still single-threaded @@ -390,7 +408,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) } if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) { - flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); btrfs_err(fs_info, "qgroup generation mismatch, marked as inconsistent"); } @@ -408,7 +426,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { btrfs_err(fs_info, "inconsistent qgroup config"); - flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); } if (!qgroup) { qgroup = add_qgroup_rb(fs_info, found_key.offset); @@ -867,7 +885,8 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans) l = path->nodes[0]; slot = path->slots[0]; ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item); - btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags); + btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags & + BTRFS_QGROUP_STATUS_FLAGS_MASK); btrfs_set_qgroup_status_generation(l, ptr, trans->transid); btrfs_set_qgroup_status_rescan(l, ptr, fs_info->qgroup_rescan_progress.objectid); @@ -948,6 +967,12 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) */ lockdep_assert_held_write(&fs_info->subvol_sem); + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, + "qgroups are currently unsupported in extent tree v2"); + return -EINVAL; + } + mutex_lock(&fs_info->qgroup_ioctl_lock); if (fs_info->quota_root) goto out; @@ -1035,7 +1060,8 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION); fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON | BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags); + btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags & + BTRFS_QGROUP_STATUS_FLAGS_MASK); btrfs_set_qgroup_status_rescan(leaf, ptr, 0); btrfs_mark_buffer_dirty(leaf); @@ -1157,6 +1183,21 @@ out_add_root: fs_info->qgroup_rescan_running = true; btrfs_queue_work(fs_info->qgroup_rescan_workers, &fs_info->qgroup_rescan_work); + } else { + /* + * We have set both BTRFS_FS_QUOTA_ENABLED and + * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with + * -EINPROGRESS. That can happen because someone started the + * rescan worker by calling quota rescan ioctl before we + * attempted to initialize the rescan worker. Failure due to + * quotas disabled in the meanwhile is not possible, because + * we are holding a write lock on fs_info->subvol_sem, which + * is also acquired when disabling quotas. + * Ignore such error, and any other error would need to undo + * everything we did in the transaction we just committed. + */ + ASSERT(ret == -EINPROGRESS); + ret = 0; } out_free_path: @@ -1185,12 +1226,34 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) struct btrfs_trans_handle *trans = NULL; int ret = 0; + /* + * We need to have subvol_sem write locked, to prevent races between + * concurrent tasks trying to disable quotas, because we will unlock + * and relock qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes. + */ + lockdep_assert_held_write(&fs_info->subvol_sem); + mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) goto out; + + /* + * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to + * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs + * to lock that mutex while holding a transaction handle and the rescan + * worker needs to commit a transaction. + */ mutex_unlock(&fs_info->qgroup_ioctl_lock); /* + * Request qgroup rescan worker to complete and wait for it. This wait + * must be done before transaction start for quota disable since it may + * deadlock with transaction by the qgroup rescan worker. + */ + clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + btrfs_qgroup_wait_for_completion(fs_info, false); + + /* * 1 For the root item * * We should also reserve enough items for the quota tree deletion in @@ -1205,18 +1268,18 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; + set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); goto out; } if (!fs_info->quota_root) goto out; - clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - btrfs_qgroup_wait_for_completion(fs_info, false); spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; fs_info->quota_root = NULL; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; + fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; spin_unlock(&fs_info->qgroup_lock); btrfs_free_qgroup_config(fs_info); @@ -1430,7 +1493,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, } spin_lock(&fs_info->qgroup_lock); - ret = add_relation_rb(fs_info, src, dst); + ret = __add_relation_rb(member, parent); if (ret < 0) { spin_unlock(&fs_info->qgroup_lock); goto out; @@ -1679,7 +1742,7 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, ret = update_qgroup_limit_item(trans, qgroup); if (ret) { - fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); btrfs_info(fs_info, "unable to update quota limit for %llu", qgroupid); } @@ -1752,10 +1815,13 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, */ ASSERT(trans != NULL); + if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) + return 0; + ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root, true); if (ret < 0) { - trans->fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(trans->fs_info); btrfs_warn(trans->fs_info, "error accounting new delayed refs extent (err code: %d), quota inconsistent", ret); @@ -2231,7 +2297,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, out: btrfs_free_path(dst_path); if (ret < 0) - fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); return ret; } @@ -2242,6 +2308,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = trans->fs_info; int ret = 0; int level; + u8 drop_subptree_thres; struct extent_buffer *eb = root_eb; struct btrfs_path *path = NULL; @@ -2251,8 +2318,25 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) return 0; + spin_lock(&fs_info->qgroup_lock); + drop_subptree_thres = fs_info->qgroup_drop_subtree_thres; + spin_unlock(&fs_info->qgroup_lock); + + /* + * This function only gets called for snapshot drop, if we hit a high + * node here, it means we are going to change ownership for quite a lot + * of extents, which will greatly slow down btrfs_commit_transaction(). + * + * So here if we find a high tree here, we just skip the accounting and + * mark qgroup inconsistent. + */ + if (root_level >= drop_subptree_thres) { + qgroup_mark_inconsistent(fs_info); + return 0; + } + if (!extent_buffer_uptodate(root_eb)) { - ret = btrfs_read_buffer(root_eb, root_gen, root_level, NULL); + ret = btrfs_read_extent_buffer(root_eb, root_gen, root_level, NULL); if (ret) goto out; } @@ -2566,7 +2650,8 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, * If quotas get disabled meanwhile, the resources need to be freed and * we can't just exit here. */ - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) goto out_free; if (new_roots) { @@ -2662,7 +2747,8 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) num_dirty_extents++; trace_btrfs_qgroup_account_extents(fs_info, record); - if (!ret) { + if (!ret && !(fs_info->qgroup_flags & + BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) { /* * Old roots should be searched when inserting qgroup * extent record @@ -2735,12 +2821,10 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) spin_unlock(&fs_info->qgroup_lock); ret = update_qgroup_info_item(trans, qgroup); if (ret) - fs_info->qgroup_flags |= - BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); ret = update_qgroup_limit_item(trans, qgroup); if (ret) - fs_info->qgroup_flags |= - BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); spin_lock(&fs_info->qgroup_lock); } if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) @@ -2751,7 +2835,7 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) ret = update_qgroup_status_item(trans); if (ret) - fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); return ret; } @@ -2867,14 +2951,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, dstgroup->rsv_rfer = inherit->lim.rsv_rfer; dstgroup->rsv_excl = inherit->lim.rsv_excl; - ret = update_qgroup_limit_item(trans, dstgroup); - if (ret) { - fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - btrfs_info(fs_info, - "unable to update quota limit for %llu", - dstgroup->qgroupid); - goto unlock; - } + qgroup_dirty(fs_info, dstgroup); } if (srcid) { @@ -2975,7 +3052,7 @@ out: if (!committing) mutex_unlock(&fs_info->qgroup_ioctl_lock); if (need_rescan) - fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); return ret; } @@ -3247,7 +3324,9 @@ out: static bool rescan_should_stop(struct btrfs_fs_info *fs_info) { return btrfs_fs_closing(fs_info) || - test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) || + !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN; } static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) @@ -3277,11 +3356,9 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) err = PTR_ERR(trans); break; } - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { - err = -EINTR; - } else { - err = qgroup_rescan_leaf(trans, path); - } + + err = qgroup_rescan_leaf(trans, path); + if (err > 0) btrfs_commit_transaction(trans); else @@ -3295,7 +3372,7 @@ out: if (err > 0 && fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - } else if (err < 0) { + } else if (err < 0 || stopped) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; } mutex_unlock(&fs_info->qgroup_rescan_lock); @@ -3314,7 +3391,8 @@ out: } mutex_lock(&fs_info->qgroup_rescan_lock); - if (!stopped) + if (!stopped || + fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; if (trans) { ret = update_qgroup_status_item(trans); @@ -3325,6 +3403,7 @@ out: } } fs_info->qgroup_rescan_running = false; + fs_info->qgroup_flags &= ~BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN; complete_all(&fs_info->qgroup_rescan_completion); mutex_unlock(&fs_info->qgroup_rescan_lock); @@ -3335,6 +3414,8 @@ out: if (stopped) { btrfs_info(fs_info, "qgroup scan paused"); + } else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) { + btrfs_info(fs_info, "qgroup scan cancelled"); } else if (err >= 0) { btrfs_info(fs_info, "qgroup scan completed%s", err > 0 ? " (inconsistency flag cleared)" : ""); @@ -3383,6 +3464,9 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, btrfs_warn(fs_info, "qgroup rescan init failed, qgroup is not enabled"); ret = -EINVAL; + } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { + /* Quota disable is in progress */ + ret = -EBUSY; } if (ret) { @@ -3394,6 +3478,8 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, memset(&fs_info->qgroup_rescan_progress, 0, sizeof(fs_info->qgroup_rescan_progress)); + fs_info->qgroup_flags &= ~(BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | + BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); fs_info->qgroup_rescan_progress.objectid = progress_objectid; init_completion(&fs_info->qgroup_rescan_completion); mutex_unlock(&fs_info->qgroup_rescan_lock); @@ -3899,12 +3985,13 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, } int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, - enum btrfs_qgroup_rsv_type type, bool enforce) + enum btrfs_qgroup_rsv_type type, bool enforce, + bool noflush) { int ret; ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); - if (ret <= 0 && ret != -EDQUOT) + if ((ret <= 0 && ret != -EDQUOT) || noflush) return ret; ret = try_flush_qgroup(root); @@ -4190,8 +4277,7 @@ out_unlock: spin_unlock(&blocks->lock); out: if (ret < 0) - fs_info->qgroup_flags |= - BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); return ret; } @@ -4278,7 +4364,7 @@ out: btrfs_err_rl(fs_info, "failed to account subtree at bytenr %llu: %d", subvol_eb->start, ret); - fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); } return ret; } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 880e9df0dac1..578c77e94200 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -100,6 +100,9 @@ * subtree rescan for them. */ +#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1UL << 3) +#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1UL << 4) + /* * Record a dirty extent, and info qgroup to update quota on it * TODO: Use kmem cache to alloc it. @@ -364,19 +367,23 @@ int btrfs_qgroup_free_data(struct btrfs_inode *inode, int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type, bool enforce); int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, - enum btrfs_qgroup_rsv_type type, bool enforce); + enum btrfs_qgroup_rsv_type type, bool enforce, + bool noflush); /* Reserve metadata space for pertrans and prealloc type */ static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root, int num_bytes, bool enforce) { return __btrfs_qgroup_reserve_meta(root, num_bytes, - BTRFS_QGROUP_RSV_META_PERTRANS, enforce); + BTRFS_QGROUP_RSV_META_PERTRANS, + enforce, false); } static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root, - int num_bytes, bool enforce) + int num_bytes, bool enforce, + bool noflush) { return __btrfs_qgroup_reserve_meta(root, num_bytes, - BTRFS_QGROUP_RSV_META_PREALLOC, enforce); + BTRFS_QGROUP_RSV_META_PREALLOC, + enforce, noflush); } void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 0e239a4c3b26..82c8e991300e 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -52,132 +52,21 @@ struct btrfs_stripe_hash_table { struct btrfs_stripe_hash table[]; }; -enum btrfs_rbio_ops { - BTRFS_RBIO_WRITE, - BTRFS_RBIO_READ_REBUILD, - BTRFS_RBIO_PARITY_SCRUB, - BTRFS_RBIO_REBUILD_MISSING, -}; - -struct btrfs_raid_bio { - struct btrfs_io_context *bioc; - - /* while we're doing rmw on a stripe - * we put it into a hash table so we can - * lock the stripe and merge more rbios - * into it. - */ - struct list_head hash_list; - - /* - * LRU list for the stripe cache - */ - struct list_head stripe_cache; - - /* - * for scheduling work in the helper threads - */ - struct btrfs_work work; - - /* - * bio list and bio_list_lock are used - * to add more bios into the stripe - * in hopes of avoiding the full rmw - */ - struct bio_list bio_list; - spinlock_t bio_list_lock; - - /* also protected by the bio_list_lock, the - * plug list is used by the plugging code - * to collect partial bios while plugged. The - * stripe locking code also uses it to hand off - * the stripe lock to the next pending IO - */ - struct list_head plug_list; - - /* - * flags that tell us if it is safe to - * merge with this bio - */ - unsigned long flags; - - /* size of each individual stripe on disk */ - int stripe_len; - - /* number of data stripes (no p/q) */ - int nr_data; - - int real_stripes; - - int stripe_npages; - /* - * set if we're doing a parity rebuild - * for a read from higher up, which is handled - * differently from a parity rebuild as part of - * rmw - */ - enum btrfs_rbio_ops operation; - - /* first bad stripe */ - int faila; - - /* second bad stripe (for raid6 use) */ - int failb; - - int scrubp; - /* - * number of pages needed to represent the full - * stripe - */ - int nr_pages; - - /* - * size of all the bios in the bio_list. This - * helps us decide if the rbio maps to a full - * stripe or not - */ - int bio_list_bytes; - - int generic_bio_cnt; - - refcount_t refs; - - atomic_t stripes_pending; - - atomic_t error; - /* - * these are two arrays of pointers. We allocate the - * rbio big enough to hold them both and setup their - * locations when the rbio is allocated - */ - - /* pointers to pages that we allocated for - * reading/writing stripes directly from the disk (including P/Q) - */ - struct page **stripe_pages; - - /* - * pointers to the pages in the bio_list. Stored - * here for faster lookup - */ - struct page **bio_pages; - - /* - * bitmap to record which horizontal stripe has data - */ - unsigned long *dbitmap; - - /* allocated with real_stripes-many pointers for finish_*() calls */ - void **finish_pointers; - - /* allocated with stripe_npages-many bits for finish_*() calls */ - unsigned long *finish_pbitmap; +/* + * A bvec like structure to present a sector inside a page. + * + * Unlike bvec we don't need bvlen, as it's fixed to sectorsize. + */ +struct sector_ptr { + struct page *page; + unsigned int pgoff:24; + unsigned int uptodate:8; }; static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); static noinline void finish_rmw(struct btrfs_raid_bio *rbio); -static void rmw_work(struct btrfs_work *work); -static void read_rebuild_work(struct btrfs_work *work); +static void rmw_work(struct work_struct *work); +static void read_rebuild_work(struct work_struct *work); static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); static void __free_raid_bio(struct btrfs_raid_bio *rbio); @@ -186,12 +75,12 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check); -static void scrub_parity_work(struct btrfs_work *work); +static void scrub_parity_work(struct work_struct *work); -static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func) +static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func) { - btrfs_init_work(&rbio->work, work_func, NULL, NULL); - btrfs_queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work); + INIT_WORK(&rbio->work, work_func); + queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work); } /* @@ -239,7 +128,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) /* * caching an rbio means to copy anything from the - * bio_pages array into the stripe_pages array. We + * bio_sectors array into the stripe_pages array. We * use the page uptodate bit in the stripe cache array * to indicate if it has valid data * @@ -255,12 +144,18 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) if (ret) return; - for (i = 0; i < rbio->nr_pages; i++) { - if (!rbio->bio_pages[i]) + for (i = 0; i < rbio->nr_sectors; i++) { + /* Some range not covered by bio (partial write), skip it */ + if (!rbio->bio_sectors[i].page) continue; - copy_highpage(rbio->stripe_pages[i], rbio->bio_pages[i]); - SetPageUptodate(rbio->stripe_pages[i]); + ASSERT(rbio->stripe_sectors[i].page); + memcpy_page(rbio->stripe_sectors[i].page, + rbio->stripe_sectors[i].pgoff, + rbio->bio_sectors[i].page, + rbio->bio_sectors[i].pgoff, + rbio->bioc->fs_info->sectorsize); + rbio->stripe_sectors[i].uptodate = 1; } set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); } @@ -283,32 +178,86 @@ static int rbio_bucket(struct btrfs_raid_bio *rbio) return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); } +static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, + unsigned int page_nr) +{ + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + const u32 sectors_per_page = PAGE_SIZE / sectorsize; + int i; + + ASSERT(page_nr < rbio->nr_pages); + + for (i = sectors_per_page * page_nr; + i < sectors_per_page * page_nr + sectors_per_page; + i++) { + if (!rbio->stripe_sectors[i].uptodate) + return false; + } + return true; +} + /* - * stealing an rbio means taking all the uptodate pages from the stripe - * array in the source rbio and putting them into the destination rbio + * Update the stripe_sectors[] array to use correct page and pgoff + * + * Should be called every time any page pointer in stripes_pages[] got modified. + */ +static void index_stripe_sectors(struct btrfs_raid_bio *rbio) +{ + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + u32 offset; + int i; + + for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) { + int page_index = offset >> PAGE_SHIFT; + + ASSERT(page_index < rbio->nr_pages); + rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index]; + rbio->stripe_sectors[i].pgoff = offset_in_page(offset); + } +} + +static void steal_rbio_page(struct btrfs_raid_bio *src, + struct btrfs_raid_bio *dest, int page_nr) +{ + const u32 sectorsize = src->bioc->fs_info->sectorsize; + const u32 sectors_per_page = PAGE_SIZE / sectorsize; + int i; + + if (dest->stripe_pages[page_nr]) + __free_page(dest->stripe_pages[page_nr]); + dest->stripe_pages[page_nr] = src->stripe_pages[page_nr]; + src->stripe_pages[page_nr] = NULL; + + /* Also update the sector->uptodate bits. */ + for (i = sectors_per_page * page_nr; + i < sectors_per_page * page_nr + sectors_per_page; i++) + dest->stripe_sectors[i].uptodate = true; +} + +/* + * Stealing an rbio means taking all the uptodate pages from the stripe array + * in the source rbio and putting them into the destination rbio. + * + * This will also update the involved stripe_sectors[] which are referring to + * the old pages. */ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) { int i; struct page *s; - struct page *d; if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) return; for (i = 0; i < dest->nr_pages; i++) { s = src->stripe_pages[i]; - if (!s || !PageUptodate(s)) { + if (!s || !full_page_sectors_uptodate(src, i)) continue; - } - - d = dest->stripe_pages[i]; - if (d) - __free_page(d); - dest->stripe_pages[i] = s; - src->stripe_pages[i] = NULL; + steal_rbio_page(src, dest, i); } + index_stripe_sectors(dest); + index_stripe_sectors(src); } /* @@ -323,7 +272,9 @@ static void merge_rbio(struct btrfs_raid_bio *dest, { bio_list_merge(&dest->bio_list, &victim->bio_list); dest->bio_list_bytes += victim->bio_list_bytes; - dest->generic_bio_cnt += victim->generic_bio_cnt; + /* Also inherit the bitmaps from @victim. */ + bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap, + dest->stripe_nsectors); bio_list_init(&victim->bio_list); } @@ -522,9 +473,9 @@ static int rbio_is_full(struct btrfs_raid_bio *rbio) int ret = 1; spin_lock_irqsave(&rbio->bio_list_lock, flags); - if (size != rbio->nr_data * rbio->stripe_len) + if (size != rbio->nr_data * BTRFS_STRIPE_LEN) ret = 0; - BUG_ON(size > rbio->nr_data * rbio->stripe_len); + BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN); spin_unlock_irqrestore(&rbio->bio_list_lock, flags); return ret; @@ -600,39 +551,39 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, return 1; } -static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe, - int index) +static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio, + unsigned int stripe_nr, + unsigned int sector_nr) { - return stripe * rbio->stripe_npages + index; + ASSERT(stripe_nr < rbio->real_stripes); + ASSERT(sector_nr < rbio->stripe_nsectors); + + return stripe_nr * rbio->stripe_nsectors + sector_nr; } -/* - * these are just the pages from the rbio array, not from anything - * the FS sent down to us - */ -static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, - int index) +/* Return a sector from rbio->stripe_sectors, not from the bio list */ +static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio, + unsigned int stripe_nr, + unsigned int sector_nr) { - return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)]; + return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr, + sector_nr)]; } -/* - * helper to index into the pstripe - */ -static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) +/* Grab a sector inside P stripe */ +static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio, + unsigned int sector_nr) { - return rbio_stripe_page(rbio, rbio->nr_data, index); + return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr); } -/* - * helper to index into the qstripe, returns null - * if there is no qstripe - */ -static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) +/* Grab a sector inside Q stripe, return NULL if not RAID6 */ +static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio, + unsigned int sector_nr) { if (rbio->nr_data + 1 == rbio->real_stripes) return NULL; - return rbio_stripe_page(rbio, rbio->nr_data + 1, index); + return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr); } /* @@ -862,8 +813,12 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) struct bio *cur = bio_list_get(&rbio->bio_list); struct bio *extra; - if (rbio->generic_bio_cnt) - btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt); + /* + * Clear the data bitmap, as the rbio may be cached for later usage. + * do this before before unlock_stripe() so there will be no new bio + * for this bio. + */ + bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors); /* * At this moment, rbio->bio_list is empty, however since rbio does not @@ -911,47 +866,43 @@ static void raid_write_end_io(struct bio *bio) rbio_orig_end_io(rbio, err); } -/* - * the read/modify/write code wants to use the original bio for - * any pages it included, and then use the rbio for everything - * else. This function decides if a given index (stripe number) - * and page number in that stripe fall inside the original bio - * or the rbio. - * - * if you set bio_list_only, you'll get a NULL back for any ranges - * that are outside the bio_list +/** + * Get a sector pointer specified by its @stripe_nr and @sector_nr * - * This doesn't take any refs on anything, you get a bare page pointer - * and the caller must bump refs as required. + * @rbio: The raid bio + * @stripe_nr: Stripe number, valid range [0, real_stripe) + * @sector_nr: Sector number inside the stripe, + * valid range [0, stripe_nsectors) + * @bio_list_only: Whether to use sectors inside the bio list only. * - * You must call index_rbio_pages once before you can trust - * the answers from this function. + * The read/modify/write code wants to reuse the original bio page as much + * as possible, and only use stripe_sectors as fallback. */ -static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, - int index, int pagenr, int bio_list_only) +static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, + int stripe_nr, int sector_nr, + bool bio_list_only) { - int chunk_page; - struct page *p = NULL; + struct sector_ptr *sector; + int index; + + ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes); + ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); - chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr; + index = stripe_nr * rbio->stripe_nsectors + sector_nr; + ASSERT(index >= 0 && index < rbio->nr_sectors); spin_lock_irq(&rbio->bio_list_lock); - p = rbio->bio_pages[chunk_page]; + sector = &rbio->bio_sectors[index]; + if (sector->page || bio_list_only) { + /* Don't return sector without a valid page pointer */ + if (!sector->page) + sector = NULL; + spin_unlock_irq(&rbio->bio_list_lock); + return sector; + } spin_unlock_irq(&rbio->bio_list_lock); - if (p || bio_list_only) - return p; - - return rbio->stripe_pages[chunk_page]; -} - -/* - * number of pages we need for the entire stripe across all the - * drives - */ -static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) -{ - return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes; + return &rbio->stripe_sectors[index]; } /* @@ -959,23 +910,30 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) * this does not allocate any pages for rbio->pages. */ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, - struct btrfs_io_context *bioc, - u64 stripe_len) -{ + struct btrfs_io_context *bioc) +{ + const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs; + const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT; + const unsigned int num_pages = stripe_npages * real_stripes; + const unsigned int stripe_nsectors = + BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; + const unsigned int num_sectors = stripe_nsectors * real_stripes; struct btrfs_raid_bio *rbio; - int nr_data = 0; - int real_stripes = bioc->num_stripes - bioc->num_tgtdevs; - int num_pages = rbio_nr_pages(stripe_len, real_stripes); - int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE); void *p; + /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ + ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); + /* + * Our current stripe len should be fixed to 64k thus stripe_nsectors + * (at most 16) should be no larger than BITS_PER_LONG. + */ + ASSERT(stripe_nsectors <= BITS_PER_LONG); + rbio = kzalloc(sizeof(*rbio) + sizeof(*rbio->stripe_pages) * num_pages + - sizeof(*rbio->bio_pages) * num_pages + - sizeof(*rbio->finish_pointers) * real_stripes + - sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) + - sizeof(*rbio->finish_pbitmap) * - BITS_TO_LONGS(stripe_npages), + sizeof(*rbio->bio_sectors) * num_sectors + + sizeof(*rbio->stripe_sectors) * num_sectors + + sizeof(*rbio->finish_pointers) * real_stripes, GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); @@ -985,11 +943,13 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, spin_lock_init(&rbio->bio_list_lock); INIT_LIST_HEAD(&rbio->stripe_cache); INIT_LIST_HEAD(&rbio->hash_list); + btrfs_get_bioc(bioc); rbio->bioc = bioc; - rbio->stripe_len = stripe_len; rbio->nr_pages = num_pages; + rbio->nr_sectors = num_sectors; rbio->real_stripes = real_stripes; rbio->stripe_npages = stripe_npages; + rbio->stripe_nsectors = stripe_nsectors; rbio->faila = -1; rbio->failb = -1; refcount_set(&rbio->refs, 1); @@ -997,8 +957,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, atomic_set(&rbio->stripes_pending, 0); /* - * the stripe_pages, bio_pages, etc arrays point to the extra - * memory we allocated past the end of the rbio + * The stripe_pages, bio_sectors, etc arrays point to the extra memory + * we allocated past the end of the rbio. */ p = rbio + 1; #define CONSUME_ALLOC(ptr, count) do { \ @@ -1006,79 +966,76 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, p = (unsigned char *)p + sizeof(*(ptr)) * (count); \ } while (0) CONSUME_ALLOC(rbio->stripe_pages, num_pages); - CONSUME_ALLOC(rbio->bio_pages, num_pages); + CONSUME_ALLOC(rbio->bio_sectors, num_sectors); + CONSUME_ALLOC(rbio->stripe_sectors, num_sectors); CONSUME_ALLOC(rbio->finish_pointers, real_stripes); - CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages)); - CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages)); #undef CONSUME_ALLOC - if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) - nr_data = real_stripes - 1; - else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) - nr_data = real_stripes - 2; - else - BUG(); + ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); + rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type); - rbio->nr_data = nr_data; return rbio; } /* allocate pages for all the stripes in the bio, including parity */ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) { - int i; - struct page *page; + int ret; - for (i = 0; i < rbio->nr_pages; i++) { - if (rbio->stripe_pages[i]) - continue; - page = alloc_page(GFP_NOFS); - if (!page) - return -ENOMEM; - rbio->stripe_pages[i] = page; - } + ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages); + if (ret < 0) + return ret; + /* Mapping all sectors */ + index_stripe_sectors(rbio); return 0; } /* only allocate pages for p/q stripes */ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) { - int i; - struct page *page; + const int data_pages = rbio->nr_data * rbio->stripe_npages; + int ret; - i = rbio_stripe_page_index(rbio, rbio->nr_data, 0); + ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages, + rbio->stripe_pages + data_pages); + if (ret < 0) + return ret; - for (; i < rbio->nr_pages; i++) { - if (rbio->stripe_pages[i]) - continue; - page = alloc_page(GFP_NOFS); - if (!page) - return -ENOMEM; - rbio->stripe_pages[i] = page; - } + index_stripe_sectors(rbio); return 0; } /* - * add a single page from a specific stripe into our list of bios for IO - * this will try to merge into existing bios if possible, and returns - * zero if all went well. + * Add a single sector @sector into our list of bios for IO. + * + * Return 0 if everything went well. + * Return <0 for error. */ -static int rbio_add_io_page(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list, - struct page *page, - int stripe_nr, - unsigned long page_index, - unsigned long bio_max_len) -{ +static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list, + struct sector_ptr *sector, + unsigned int stripe_nr, + unsigned int sector_nr, + enum req_op op) +{ + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; struct bio *last = bio_list->tail; int ret; struct bio *bio; struct btrfs_io_stripe *stripe; u64 disk_start; + /* + * Note: here stripe_nr has taken device replace into consideration, + * thus it can be larger than rbio->real_stripe. + * So here we check against bioc->num_stripes, not rbio->real_stripes. + */ + ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes); + ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); + ASSERT(sector->page); + stripe = &rbio->bioc->stripes[stripe_nr]; - disk_start = stripe->physical + (page_index << PAGE_SHIFT); + disk_start = stripe->physical + sector_nr * sectorsize; /* if the device is missing, just fail this stripe */ if (!stripe->dev->bdev) @@ -1095,20 +1052,21 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, */ if (last_end == disk_start && !last->bi_status && last->bi_bdev == stripe->dev->bdev) { - ret = bio_add_page(last, page, PAGE_SIZE, 0); - if (ret == PAGE_SIZE) + ret = bio_add_page(last, sector->page, sectorsize, + sector->pgoff); + if (ret == sectorsize) return 0; } } /* put a new bio on the list */ - bio = btrfs_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); - btrfs_bio(bio)->device = stripe->dev; - bio->bi_iter.bi_size = 0; - bio_set_dev(bio, stripe->dev->bdev); + bio = bio_alloc(stripe->dev->bdev, + max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1), + op, GFP_NOFS); bio->bi_iter.bi_sector = disk_start >> 9; + bio->bi_private = rbio; - bio_add_page(bio, page, PAGE_SIZE, 0); + bio_add_page(bio, sector->page, sectorsize, sector->pgoff); bio_list_add(bio_list, bio); return 0; } @@ -1130,6 +1088,29 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) } } +static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) +{ + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + struct bio_vec bvec; + struct bvec_iter iter; + u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - + rbio->bioc->raid_map[0]; + + bio_for_each_segment(bvec, bio, iter) { + u32 bvec_offset; + + for (bvec_offset = 0; bvec_offset < bvec.bv_len; + bvec_offset += sectorsize, offset += sectorsize) { + int index = offset / sectorsize; + struct sector_ptr *sector = &rbio->bio_sectors[index]; + + sector->page = bvec.bv_page; + sector->pgoff = bvec.bv_offset + bvec_offset; + ASSERT(sector->pgoff < PAGE_SIZE); + } + } +} + /* * helper function to walk our bio list and populate the bio_pages array with * the result. This seems expensive, but it is faster than constantly @@ -1141,29 +1122,40 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) static void index_rbio_pages(struct btrfs_raid_bio *rbio) { struct bio *bio; - u64 start; - unsigned long stripe_offset; - unsigned long page_index; spin_lock_irq(&rbio->bio_list_lock); - bio_list_for_each(bio, &rbio->bio_list) { - struct bio_vec bvec; - struct bvec_iter iter; - int i = 0; + bio_list_for_each(bio, &rbio->bio_list) + index_one_bio(rbio, bio); - start = bio->bi_iter.bi_sector << 9; - stripe_offset = start - rbio->bioc->raid_map[0]; - page_index = stripe_offset >> PAGE_SHIFT; + spin_unlock_irq(&rbio->bio_list_lock); +} - if (bio_flagged(bio, BIO_CLONED)) - bio->bi_iter = btrfs_bio(bio)->iter; +static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio, + struct raid56_bio_trace_info *trace_info) +{ + const struct btrfs_io_context *bioc = rbio->bioc; + int i; - bio_for_each_segment(bvec, bio, iter) { - rbio->bio_pages[page_index + i] = bvec.bv_page; - i++; - } + ASSERT(bioc); + + /* We rely on bio->bi_bdev to find the stripe number. */ + if (!bio->bi_bdev) + goto not_found; + + for (i = 0; i < bioc->num_stripes; i++) { + if (bio->bi_bdev != bioc->stripes[i].dev->bdev) + continue; + trace_info->stripe_nr = i; + trace_info->devid = bioc->stripes[i].dev->devid; + trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - + bioc->stripes[i].physical; + return; } - spin_unlock_irq(&rbio->bio_list_lock); + +not_found: + trace_info->devid = -1; + trace_info->offset = -1; + trace_info->stripe_nr = -1; } /* @@ -1177,10 +1169,14 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) static noinline void finish_rmw(struct btrfs_raid_bio *rbio) { struct btrfs_io_context *bioc = rbio->bioc; + const u32 sectorsize = bioc->fs_info->sectorsize; void **pointers = rbio->finish_pointers; int nr_data = rbio->nr_data; + /* The total sector number inside the full stripe. */ + int total_sector_nr; int stripe; - int pagenr; + /* Sector number inside a stripe. */ + int sectornr; bool has_qstripe; struct bio_list bio_list; struct bio *bio; @@ -1195,6 +1191,9 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) else BUG(); + /* We should have at least one data sector. */ + ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); + /* at this point we either have a full stripe, * or we've read the full stripe from the drive. * recalculate the parity and write the new results. @@ -1224,86 +1223,108 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *p; - /* first collect one page from each data stripe */ + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + struct sector_ptr *sector; + + /* First collect one sector from each data stripe */ for (stripe = 0; stripe < nr_data; stripe++) { - p = page_in_rbio(rbio, stripe, pagenr, 0); - pointers[stripe] = kmap_local_page(p); + sector = sector_in_rbio(rbio, stripe, sectornr, 0); + pointers[stripe] = kmap_local_page(sector->page) + + sector->pgoff; } - /* then add the parity stripe */ - p = rbio_pstripe_page(rbio, pagenr); - SetPageUptodate(p); - pointers[stripe++] = kmap_local_page(p); + /* Then add the parity stripe */ + sector = rbio_pstripe_sector(rbio, sectornr); + sector->uptodate = 1; + pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; if (has_qstripe) { - /* - * raid6, add the qstripe and call the - * library function to fill in our p/q + * RAID6, add the qstripe and call the library function + * to fill in our p/q */ - p = rbio_qstripe_page(rbio, pagenr); - SetPageUptodate(p); - pointers[stripe++] = kmap_local_page(p); + sector = rbio_qstripe_sector(rbio, sectornr); + sector->uptodate = 1; + pointers[stripe++] = kmap_local_page(sector->page) + + sector->pgoff; - raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, + raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, pointers); } else { /* raid5 */ - copy_page(pointers[nr_data], pointers[0]); - run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); + memcpy(pointers[nr_data], pointers[0], sectorsize); + run_xor(pointers + 1, nr_data - 1, sectorsize); } for (stripe = stripe - 1; stripe >= 0; stripe--) kunmap_local(pointers[stripe]); } /* - * time to start writing. Make bios for everything from the - * higher layers (the bio_list in our rbio) and our p/q. Ignore - * everything else. + * Start writing. Make bios for everything from the higher layers (the + * bio_list in our rbio) and our P/Q. Ignore everything else. */ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *page; - if (stripe < rbio->nr_data) { - page = page_in_rbio(rbio, stripe, pagenr, 1); - if (!page) - continue; - } else { - page = rbio_stripe_page(rbio, stripe, pagenr); - } + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct sector_ptr *sector; + + stripe = total_sector_nr / rbio->stripe_nsectors; + sectornr = total_sector_nr % rbio->stripe_nsectors; + + /* This vertical stripe has no data, skip it. */ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; - ret = rbio_add_io_page(rbio, &bio_list, - page, stripe, pagenr, rbio->stripe_len); - if (ret) - goto cleanup; + if (stripe < rbio->nr_data) { + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (!sector) + continue; + } else { + sector = rbio_stripe_sector(rbio, stripe, sectornr); } + + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + sectornr, REQ_OP_WRITE); + if (ret) + goto cleanup; } if (likely(!bioc->num_tgtdevs)) goto write_data; - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - if (!bioc->tgtdev_map[stripe]) + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct sector_ptr *sector; + + stripe = total_sector_nr / rbio->stripe_nsectors; + sectornr = total_sector_nr % rbio->stripe_nsectors; + + if (!bioc->tgtdev_map[stripe]) { + /* + * We can skip the whole stripe completely, note + * total_sector_nr will be increased by one anyway. + */ + ASSERT(sectornr == 0); + total_sector_nr += rbio->stripe_nsectors - 1; continue; + } - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *page; - if (stripe < rbio->nr_data) { - page = page_in_rbio(rbio, stripe, pagenr, 1); - if (!page) - continue; - } else { - page = rbio_stripe_page(rbio, stripe, pagenr); - } + /* This vertical stripe has no data, skip it. */ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; - ret = rbio_add_io_page(rbio, &bio_list, page, - rbio->bioc->tgtdev_map[stripe], - pagenr, rbio->stripe_len); - if (ret) - goto cleanup; + if (stripe < rbio->nr_data) { + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (!sector) + continue; + } else { + sector = rbio_stripe_sector(rbio, stripe, sectornr); } + + ret = rbio_add_io_sector(rbio, &bio_list, sector, + rbio->bioc->tgtdev_map[stripe], + sectornr, REQ_OP_WRITE); + if (ret) + goto cleanup; } write_data: @@ -1311,10 +1332,14 @@ write_data: BUG_ON(atomic_read(&rbio->stripes_pending) == 0); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; - bio->bi_opf = REQ_OP_WRITE; + if (trace_raid56_write_stripe_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_write_stripe(rbio, bio, &trace_info); + } submit_bio(bio); } return; @@ -1342,7 +1367,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, for (i = 0; i < rbio->bioc->num_stripes; i++) { stripe = &rbio->bioc->stripes[i]; - if (in_range(physical, stripe->physical, rbio->stripe_len) && + if (in_range(physical, stripe->physical, BTRFS_STRIPE_LEN) && stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) { return i; } @@ -1364,7 +1389,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, for (i = 0; i < rbio->nr_data; i++) { u64 stripe_start = rbio->bioc->raid_map[i]; - if (in_range(logical, stripe_start, rbio->stripe_len)) + if (in_range(logical, stripe_start, BTRFS_STRIPE_LEN)) return i; } return -1; @@ -1417,56 +1442,89 @@ static int fail_bio_stripe(struct btrfs_raid_bio *rbio, } /* + * For subpage case, we can no longer set page Uptodate directly for + * stripe_pages[], thus we need to locate the sector. + */ +static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, + struct page *page, + unsigned int pgoff) +{ + int i; + + for (i = 0; i < rbio->nr_sectors; i++) { + struct sector_ptr *sector = &rbio->stripe_sectors[i]; + + if (sector->page == page && sector->pgoff == pgoff) + return sector; + } + return NULL; +} + +/* * this sets each page in the bio uptodate. It should only be used on private * rbio pages, nothing that comes in from the higher layers */ -static void set_bio_pages_uptodate(struct bio *bio) +static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) { + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; struct bio_vec *bvec; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) - SetPageUptodate(bvec->bv_page); + bio_for_each_segment_all(bvec, bio, iter_all) { + struct sector_ptr *sector; + int pgoff; + + for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len; + pgoff += sectorsize) { + sector = find_stripe_sector(rbio, bvec->bv_page, pgoff); + ASSERT(sector); + if (sector) + sector->uptodate = 1; + } + } } -/* - * end io for the read phase of the rmw cycle. All the bios here are physical - * stripe bios we've read from the disk so we can recalculate the parity of the - * stripe. - * - * This will usually kick off finish_rmw once all the bios are read in, but it - * may trigger parity reconstruction if we had any errors along the way - */ -static void raid_rmw_end_io(struct bio *bio) +static void raid56_bio_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; if (bio->bi_status) fail_bio_stripe(rbio, bio); else - set_bio_pages_uptodate(bio); + set_bio_pages_uptodate(rbio, bio); bio_put(bio); - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; + if (atomic_dec_and_test(&rbio->stripes_pending)) + queue_work(rbio->bioc->fs_info->endio_raid56_workers, + &rbio->end_io_work); +} - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) - goto cleanup; +/* + * End io handler for the read phase of the RMW cycle. All the bios here are + * physical stripe bios we've read from the disk so we can recalculate the + * parity of the stripe. + * + * This will usually kick off finish_rmw once all the bios are read in, but it + * may trigger parity reconstruction if we had any errors along the way + */ +static void raid56_rmw_end_io_work(struct work_struct *work) +{ + struct btrfs_raid_bio *rbio = + container_of(work, struct btrfs_raid_bio, end_io_work); + + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { + rbio_orig_end_io(rbio, BLK_STS_IOERR); + return; + } /* - * this will normally call finish_rmw to start our write - * but if there are any failed stripes we'll reconstruct - * from parity first + * This will normally call finish_rmw to start our write but if there + * are any failed stripes we'll reconstruct from parity first. */ validate_rbio_for_rmw(rbio); - return; - -cleanup: - - rbio_orig_end_io(rbio, BLK_STS_IOERR); } /* @@ -1477,9 +1535,9 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) { int bios_to_read = 0; struct bio_list bio_list; + const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data; int ret; - int pagenr; - int stripe; + int total_sector_nr; struct bio *bio; bio_list_init(&bio_list); @@ -1491,36 +1549,34 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); atomic_set(&rbio->error, 0); - /* - * build a list of bios to read all the missing parts of this - * stripe - */ - for (stripe = 0; stripe < rbio->nr_data; stripe++) { - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *page; - /* - * we want to find all the pages missing from - * the rbio and read them from the disk. If - * page_in_rbio finds a page in the bio list - * we don't need to read it off the stripe. - */ - page = page_in_rbio(rbio, stripe, pagenr, 1); - if (page) - continue; + /* Build a list of bios to read all the missing data sectors. */ + for (total_sector_nr = 0; total_sector_nr < nr_data_sectors; + total_sector_nr++) { + struct sector_ptr *sector; + int stripe = total_sector_nr / rbio->stripe_nsectors; + int sectornr = total_sector_nr % rbio->stripe_nsectors; - page = rbio_stripe_page(rbio, stripe, pagenr); - /* - * the bio cache may have handed us an uptodate - * page. If so, be happy and use it - */ - if (PageUptodate(page)) - continue; + /* + * We want to find all the sectors missing from the rbio and + * read them from the disk. If sector_in_rbio() finds a page + * in the bio list we don't need to read it off the stripe. + */ + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (sector) + continue; - ret = rbio_add_io_page(rbio, &bio_list, page, - stripe, pagenr, rbio->stripe_len); - if (ret) - goto cleanup; - } + sector = rbio_stripe_sector(rbio, stripe, sectornr); + /* + * The bio cache may have handed us an uptodate page. If so, + * use it. + */ + if (sector->uptodate) + continue; + + ret = rbio_add_io_sector(rbio, &bio_list, sector, + stripe, sectornr, REQ_OP_READ); + if (ret) + goto cleanup; } bios_to_read = bio_list_size(&bio_list); @@ -1539,13 +1595,16 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) * touch it after that. */ atomic_set(&rbio->stripes_pending, bios_to_read); + INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_private = rbio; - bio->bi_end_io = raid_rmw_end_io; - bio->bi_opf = REQ_OP_READ; + bio->bi_end_io = raid56_bio_end_io; - btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + if (trace_raid56_read_partial_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_read_partial(rbio, bio, &trace_info); + } submit_bio(bio); } /* the actual write will happen once the reads are done */ @@ -1573,10 +1632,8 @@ static int full_stripe_write(struct btrfs_raid_bio *rbio) int ret; ret = alloc_rbio_parity_pages(rbio); - if (ret) { - __free_raid_bio(rbio); + if (ret) return ret; - } ret = lock_stripe_add(rbio); if (ret == 0) @@ -1624,7 +1681,7 @@ struct btrfs_plug_cb { struct blk_plug_cb cb; struct btrfs_fs_info *info; struct list_head rbio_list; - struct btrfs_work work; + struct work_struct work; }; /* @@ -1692,7 +1749,7 @@ static void run_plug(struct btrfs_plug_cb *plug) * if the unplug comes from schedule, we have to push the * work off to a helper thread */ -static void unplug_work(struct btrfs_work *work) +static void unplug_work(struct work_struct *work) { struct btrfs_plug_cb *plug; plug = container_of(work, struct btrfs_plug_cb, work); @@ -1705,37 +1762,58 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) plug = container_of(cb, struct btrfs_plug_cb, cb); if (from_schedule) { - btrfs_init_work(&plug->work, unplug_work, NULL, NULL); - btrfs_queue_work(plug->info->rmw_workers, - &plug->work); + INIT_WORK(&plug->work, unplug_work); + queue_work(plug->info->rmw_workers, &plug->work); return; } run_plug(plug); } +/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */ +static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) +{ + const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT; + const u64 full_stripe_start = rbio->bioc->raid_map[0]; + const u32 orig_len = orig_bio->bi_iter.bi_size; + const u32 sectorsize = fs_info->sectorsize; + u64 cur_logical; + + ASSERT(orig_logical >= full_stripe_start && + orig_logical + orig_len <= full_stripe_start + + rbio->nr_data * BTRFS_STRIPE_LEN); + + bio_list_add(&rbio->bio_list, orig_bio); + rbio->bio_list_bytes += orig_bio->bi_iter.bi_size; + + /* Update the dbitmap. */ + for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len; + cur_logical += sectorsize) { + int bit = ((u32)(cur_logical - full_stripe_start) >> + fs_info->sectorsize_bits) % rbio->stripe_nsectors; + + set_bit(bit, &rbio->dbitmap); + } +} + /* * our main entry point for writes from the rest of the FS. */ -int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, - u64 stripe_len) +void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; struct btrfs_plug_cb *plug = NULL; struct blk_plug_cb *cb; - int ret; + int ret = 0; - rbio = alloc_rbio(fs_info, bioc, stripe_len); + rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { - btrfs_put_bioc(bioc); - return PTR_ERR(rbio); + ret = PTR_ERR(rbio); + goto fail; } - bio_list_add(&rbio->bio_list, bio); - rbio->bio_list_bytes = bio->bi_iter.bi_size; rbio->operation = BTRFS_RBIO_WRITE; - - btrfs_bio_counter_inc_noblocked(fs_info); - rbio->generic_bio_cnt = 1; + rbio_add_bio(rbio, bio); /* * don't plug on full rbios, just get them out the door @@ -1743,9 +1821,11 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, */ if (rbio_is_full(rbio)) { ret = full_stripe_write(rbio); - if (ret) - btrfs_bio_counter_dec(fs_info); - return ret; + if (ret) { + __free_raid_bio(rbio); + goto fail; + } + return; } cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug)); @@ -1756,13 +1836,19 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, INIT_LIST_HEAD(&plug->rbio_list); } list_add_tail(&rbio->plug_list, &plug->rbio_list); - ret = 0; } else { ret = __raid56_parity_write(rbio); - if (ret) - btrfs_bio_counter_dec(fs_info); + if (ret) { + __free_raid_bio(rbio); + goto fail; + } } - return ret; + + return; + +fail: + bio->bi_status = errno_to_blk_status(ret); + bio_endio(bio); } /* @@ -1772,14 +1858,18 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, */ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) { - int pagenr, stripe; + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + int sectornr, stripe; void **pointers; void **unmap_array; int faila = -1, failb = -1; - struct page *page; blk_status_t err; int i; + /* + * This array stores the pointer for each sector, thus it has the extra + * pgoff value added from each sector + */ pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); if (!pointers) { err = BLK_STS_RESOURCE; @@ -1808,43 +1898,44 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + struct sector_ptr *sector; + /* * Now we just use bitmap to mark the horizontal stripes in * which we have data when doing parity scrub. */ if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && - !test_bit(pagenr, rbio->dbitmap)) + !test_bit(sectornr, &rbio->dbitmap)) continue; /* - * Setup our array of pointers with pages from each stripe + * Setup our array of pointers with sectors from each stripe * * NOTE: store a duplicate array of pointers to preserve the * pointer order */ for (stripe = 0; stripe < rbio->real_stripes; stripe++) { /* - * if we're rebuilding a read, we have to use + * If we're rebuilding a read, we have to use * pages from the bio list */ if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && (stripe == faila || stripe == failb)) { - page = page_in_rbio(rbio, stripe, pagenr, 0); + sector = sector_in_rbio(rbio, stripe, sectornr, 0); } else { - page = rbio_stripe_page(rbio, stripe, pagenr); + sector = rbio_stripe_sector(rbio, stripe, sectornr); } - pointers[stripe] = kmap_local_page(page); + ASSERT(sector->page); + pointers[stripe] = kmap_local_page(sector->page) + + sector->pgoff; unmap_array[stripe] = pointers[stripe]; } - /* all raid6 handling here */ + /* All raid6 handling here */ if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { - /* - * single failure, rebuild from parity raid5 - * style - */ + /* Single failure, rebuild from parity raid5 style */ if (failb < 0) { if (faila == rbio->nr_data) { /* @@ -1887,10 +1978,10 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) { raid6_datap_recov(rbio->real_stripes, - PAGE_SIZE, faila, pointers); + sectorsize, faila, pointers); } else { raid6_2data_recov(rbio->real_stripes, - PAGE_SIZE, faila, failb, + sectorsize, faila, failb, pointers); } } else { @@ -1900,7 +1991,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) BUG_ON(failb != -1); pstripe: /* Copy parity block into failed block to start with */ - copy_page(pointers[faila], pointers[rbio->nr_data]); + memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); /* rearrange the pointer array */ p = pointers[faila]; @@ -1909,7 +2000,7 @@ pstripe: pointers[rbio->nr_data - 1] = p; /* xor in the rest */ - run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE); + run_xor(pointers, rbio->nr_data - 1, sectorsize); } /* if we're doing this rebuild as part of an rmw, go through * and set all of our private rbio pages in the @@ -1918,14 +2009,14 @@ pstripe: * other endio functions will fiddle the uptodate bits */ if (rbio->operation == BTRFS_RBIO_WRITE) { - for (i = 0; i < rbio->stripe_npages; i++) { + for (i = 0; i < rbio->stripe_nsectors; i++) { if (faila != -1) { - page = rbio_stripe_page(rbio, faila, i); - SetPageUptodate(page); + sector = rbio_stripe_sector(rbio, faila, i); + sector->uptodate = 1; } if (failb != -1) { - page = rbio_stripe_page(rbio, failb, i); - SetPageUptodate(page); + sector = rbio_stripe_sector(rbio, failb, i); + sector->uptodate = 1; } } } @@ -1984,25 +2075,13 @@ cleanup_io: } /* - * This is called only for stripes we've read from disk to - * reconstruct the parity. + * This is called only for stripes we've read from disk to reconstruct the + * parity. */ -static void raid_recover_end_io(struct bio *bio) +static void raid_recover_end_io_work(struct work_struct *work) { - struct btrfs_raid_bio *rbio = bio->bi_private; - - /* - * we only read stripe pages off the disk, set them - * up to date if there were no errors - */ - if (bio->bi_status) - fail_bio_stripe(rbio, bio); - else - set_bio_pages_uptodate(bio); - bio_put(bio); - - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; + struct btrfs_raid_bio *rbio = + container_of(work, struct btrfs_raid_bio, end_io_work); if (atomic_read(&rbio->error) > rbio->bioc->max_errors) rbio_orig_end_io(rbio, BLK_STS_IOERR); @@ -2023,8 +2102,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int pagenr; - int stripe; + int total_sector_nr; struct bio *bio; bio_list_init(&bio_list); @@ -2036,33 +2114,31 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) atomic_set(&rbio->error, 0); /* - * read everything that hasn't failed. Thanks to the - * stripe cache, it is possible that some or all of these - * pages are going to be uptodate. + * Read everything that hasn't failed. However this time we will + * not trust any cached sector. + * As we may read out some stale data but higher layer is not reading + * that stale part. + * + * So here we always re-read everything in recovery path. */ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + int stripe = total_sector_nr / rbio->stripe_nsectors; + int sectornr = total_sector_nr % rbio->stripe_nsectors; + struct sector_ptr *sector; + if (rbio->faila == stripe || rbio->failb == stripe) { atomic_inc(&rbio->error); + /* Skip the current stripe. */ + ASSERT(sectornr == 0); + total_sector_nr += rbio->stripe_nsectors - 1; continue; } - - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *p; - - /* - * the rmw code may have already read this - * page in - */ - p = rbio_stripe_page(rbio, stripe, pagenr); - if (PageUptodate(p)) - continue; - - ret = rbio_add_io_page(rbio, &bio_list, - rbio_stripe_page(rbio, stripe, pagenr), - stripe, pagenr, rbio->stripe_len); - if (ret < 0) - goto cleanup; - } + sector = rbio_stripe_sector(rbio, stripe, sectornr); + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + sectornr, REQ_OP_READ); + if (ret < 0) + goto cleanup; } bios_to_read = bio_list_size(&bio_list); @@ -2085,13 +2161,16 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) * touch it after that. */ atomic_set(&rbio->stripes_pending, bios_to_read); + INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_private = rbio; - bio->bi_end_io = raid_recover_end_io; - bio->bi_opf = REQ_OP_READ; + bio->bi_end_io = raid56_bio_end_io; - btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + if (trace_raid56_scrub_read_recover_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_scrub_read_recover(rbio, bio, &trace_info); + } submit_bio(bio); } @@ -2114,28 +2193,20 @@ cleanup: * so we assume the bio they send down corresponds to a failed part * of the drive. */ -int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, - u64 stripe_len, int mirror_num, int generic_io) +void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, + int mirror_num) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; - int ret; - - if (generic_io) { - ASSERT(bioc->mirror_num == mirror_num); - btrfs_bio(bio)->mirror_num = mirror_num; - } - rbio = alloc_rbio(fs_info, bioc, stripe_len); + rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { - if (generic_io) - btrfs_put_bioc(bioc); - return PTR_ERR(rbio); + bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); + goto out_end_bio; } rbio->operation = BTRFS_RBIO_READ_REBUILD; - bio_list_add(&rbio->bio_list, bio); - rbio->bio_list_bytes = bio->bi_iter.bi_size; + rbio_add_bio(rbio, bio); rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { @@ -2143,17 +2214,9 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)", __func__, bio->bi_iter.bi_sector << 9, (u64)bio->bi_iter.bi_size, bioc->map_type); - if (generic_io) - btrfs_put_bioc(bioc); - kfree(rbio); - return -EIO; - } - - if (generic_io) { - btrfs_bio_counter_inc_noblocked(fs_info); - rbio->generic_bio_cnt = 1; - } else { - btrfs_get_bioc(bioc); + __free_raid_bio(rbio); + bio->bi_status = BLK_STS_IOERR; + goto out_end_bio; } /* @@ -2173,27 +2236,21 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, rbio->failb--; } - ret = lock_stripe_add(rbio); + if (lock_stripe_add(rbio)) + return; /* - * __raid56_parity_recover will end the bio with - * any errors it hits. We don't want to return - * its error value up the stack because our caller - * will end up calling bio_endio with any nonzero - * return + * This adds our rbio to the list of rbios that will be handled after + * the current lock owner is done. */ - if (ret == 0) - __raid56_parity_recover(rbio); - /* - * our rbio has been added to the list of - * rbios that will be handled after the - * currently lock owner is done - */ - return 0; + __raid56_parity_recover(rbio); + return; +out_end_bio: + bio_endio(bio); } -static void rmw_work(struct btrfs_work *work) +static void rmw_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; @@ -2201,7 +2258,7 @@ static void rmw_work(struct btrfs_work *work) raid56_rmw_stripe(rbio); } -static void read_rebuild_work(struct btrfs_work *work) +static void read_rebuild_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; @@ -2221,14 +2278,14 @@ static void read_rebuild_work(struct btrfs_work *work) struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, struct btrfs_io_context *bioc, - u64 stripe_len, struct btrfs_device *scrub_dev, + struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; int i; - rbio = alloc_rbio(fs_info, bioc, stripe_len); + rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) return NULL; bio_list_add(&rbio->bio_list, bio); @@ -2252,33 +2309,25 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, } ASSERT(i < rbio->real_stripes); - /* Now we just support the sectorsize equals to page size */ - ASSERT(fs_info->sectorsize == PAGE_SIZE); - ASSERT(rbio->stripe_npages == stripe_nsectors); - bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); - - /* - * We have already increased bio_counter when getting bioc, record it - * so we can free it at rbio_orig_end_io(). - */ - rbio->generic_bio_cnt = 1; - + bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors); return rbio; } /* Used for both parity scrub and missing. */ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, - u64 logical) + unsigned int pgoff, u64 logical) { + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; int stripe_offset; int index; ASSERT(logical >= rbio->bioc->raid_map[0]); - ASSERT(logical + PAGE_SIZE <= rbio->bioc->raid_map[0] + - rbio->stripe_len * rbio->nr_data); + ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] + + BTRFS_STRIPE_LEN * rbio->nr_data); stripe_offset = (int)(logical - rbio->bioc->raid_map[0]); - index = stripe_offset >> PAGE_SHIFT; - rbio->bio_pages[index] = page; + index = stripe_offset / sectorsize; + rbio->bio_sectors[index].page = page; + rbio->bio_sectors[index].pgoff = pgoff; } /* @@ -2287,23 +2336,25 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, */ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) { - int i; - int bit; - int index; - struct page *page; + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + int total_sector_nr; - for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) { - for (i = 0; i < rbio->real_stripes; i++) { - index = i * rbio->stripe_npages + bit; - if (rbio->stripe_pages[index]) - continue; + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct page *page; + int sectornr = total_sector_nr % rbio->stripe_nsectors; + int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT; - page = alloc_page(GFP_NOFS); - if (!page) - return -ENOMEM; - rbio->stripe_pages[index] = page; - } + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; + if (rbio->stripe_pages[index]) + continue; + page = alloc_page(GFP_NOFS); + if (!page) + return -ENOMEM; + rbio->stripe_pages[index] = page; } + index_stripe_sectors(rbio); return 0; } @@ -2311,14 +2362,15 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) { struct btrfs_io_context *bioc = rbio->bioc; + const u32 sectorsize = bioc->fs_info->sectorsize; void **pointers = rbio->finish_pointers; - unsigned long *pbitmap = rbio->finish_pbitmap; + unsigned long *pbitmap = &rbio->finish_pbitmap; int nr_data = rbio->nr_data; int stripe; - int pagenr; + int sectornr; bool has_qstripe; - struct page *p_page = NULL; - struct page *q_page = NULL; + struct sector_ptr p_sector = { 0 }; + struct sector_ptr q_sector = { 0 }; struct bio_list bio_list; struct bio *bio; int is_replace = 0; @@ -2335,7 +2387,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) { is_replace = 1; - bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages); + bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors); } /* @@ -2348,54 +2400,59 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, if (!need_check) goto writeback; - p_page = alloc_page(GFP_NOFS); - if (!p_page) + p_sector.page = alloc_page(GFP_NOFS); + if (!p_sector.page) goto cleanup; - SetPageUptodate(p_page); + p_sector.pgoff = 0; + p_sector.uptodate = 1; if (has_qstripe) { /* RAID6, allocate and map temp space for the Q stripe */ - q_page = alloc_page(GFP_NOFS); - if (!q_page) { - __free_page(p_page); + q_sector.page = alloc_page(GFP_NOFS); + if (!q_sector.page) { + __free_page(p_sector.page); + p_sector.page = NULL; goto cleanup; } - SetPageUptodate(q_page); - pointers[rbio->real_stripes - 1] = kmap_local_page(q_page); + q_sector.pgoff = 0; + q_sector.uptodate = 1; + pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page); } atomic_set(&rbio->error, 0); /* Map the parity stripe just once */ - pointers[nr_data] = kmap_local_page(p_page); + pointers[nr_data] = kmap_local_page(p_sector.page); - for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { - struct page *p; + for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { + struct sector_ptr *sector; void *parity; + /* first collect one page from each data stripe */ for (stripe = 0; stripe < nr_data; stripe++) { - p = page_in_rbio(rbio, stripe, pagenr, 0); - pointers[stripe] = kmap_local_page(p); + sector = sector_in_rbio(rbio, stripe, sectornr, 0); + pointers[stripe] = kmap_local_page(sector->page) + + sector->pgoff; } if (has_qstripe) { /* RAID6, call the library function to fill in our P/Q */ - raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, + raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, pointers); } else { /* raid5 */ - copy_page(pointers[nr_data], pointers[0]); - run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); + memcpy(pointers[nr_data], pointers[0], sectorsize); + run_xor(pointers + 1, nr_data - 1, sectorsize); } /* Check scrubbing parity and repair it */ - p = rbio_stripe_page(rbio, rbio->scrubp, pagenr); - parity = kmap_local_page(p); - if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE)) - copy_page(parity, pointers[rbio->scrubp]); + sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); + parity = kmap_local_page(sector->page) + sector->pgoff; + if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0) + memcpy(parity, pointers[rbio->scrubp], sectorsize); else /* Parity is right, needn't writeback */ - bitmap_clear(rbio->dbitmap, pagenr, 1); + bitmap_clear(&rbio->dbitmap, sectornr, 1); kunmap_local(parity); for (stripe = nr_data - 1; stripe >= 0; stripe--) @@ -2403,10 +2460,12 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, } kunmap_local(pointers[nr_data]); - __free_page(p_page); - if (q_page) { + __free_page(p_sector.page); + p_sector.page = NULL; + if (q_sector.page) { kunmap_local(pointers[rbio->real_stripes - 1]); - __free_page(q_page); + __free_page(q_sector.page); + q_sector.page = NULL; } writeback: @@ -2415,12 +2474,12 @@ writeback: * higher layers (the bio_list in our rbio) and our p/q. Ignore * everything else. */ - for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { - struct page *page; + for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { + struct sector_ptr *sector; - page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); - ret = rbio_add_io_page(rbio, &bio_list, - page, rbio->scrubp, pagenr, rbio->stripe_len); + sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); + ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp, + sectornr, REQ_OP_WRITE); if (ret) goto cleanup; } @@ -2428,13 +2487,13 @@ writeback: if (!is_replace) goto submit_write; - for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) { - struct page *page; + for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) { + struct sector_ptr *sector; - page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); - ret = rbio_add_io_page(rbio, &bio_list, page, + sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); + ret = rbio_add_io_sector(rbio, &bio_list, sector, bioc->tgtdev_map[rbio->scrubp], - pagenr, rbio->stripe_len); + sectornr, REQ_OP_WRITE); if (ret) goto cleanup; } @@ -2450,10 +2509,14 @@ submit_write: atomic_set(&rbio->stripes_pending, nr_data); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; - bio->bi_opf = REQ_OP_WRITE; + if (trace_raid56_scrub_write_stripe_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_scrub_write_stripe(rbio, bio, &trace_info); + } submit_bio(bio); } return; @@ -2541,24 +2604,14 @@ cleanup: * This will usually kick off finish_rmw once all the bios are read in, but it * may trigger parity reconstruction if we had any errors along the way */ -static void raid56_parity_scrub_end_io(struct bio *bio) +static void raid56_parity_scrub_end_io_work(struct work_struct *work) { - struct btrfs_raid_bio *rbio = bio->bi_private; - - if (bio->bi_status) - fail_bio_stripe(rbio, bio); - else - set_bio_pages_uptodate(bio); - - bio_put(bio); - - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; + struct btrfs_raid_bio *rbio = + container_of(work, struct btrfs_raid_bio, end_io_work); /* - * this will normally call finish_rmw to start our write - * but if there are any failed stripes we'll reconstruct - * from parity first + * This will normally call finish_rmw to start our write, but if there + * are any failed stripes we'll reconstruct from parity first */ validate_rbio_for_parity_scrub(rbio); } @@ -2568,8 +2621,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int pagenr; - int stripe; + int total_sector_nr; struct bio *bio; bio_list_init(&bio_list); @@ -2579,36 +2631,38 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) goto cleanup; atomic_set(&rbio->error, 0); - /* - * build a list of bios to read all the missing parts of this - * stripe - */ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { - struct page *page; - /* - * we want to find all the pages missing from - * the rbio and read them from the disk. If - * page_in_rbio finds a page in the bio list - * we don't need to read it off the stripe. - */ - page = page_in_rbio(rbio, stripe, pagenr, 1); - if (page) - continue; + /* Build a list of bios to read all the missing parts. */ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + int sectornr = total_sector_nr % rbio->stripe_nsectors; + int stripe = total_sector_nr / rbio->stripe_nsectors; + struct sector_ptr *sector; + + /* No data in the vertical stripe, no need to read. */ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; - page = rbio_stripe_page(rbio, stripe, pagenr); - /* - * the bio cache may have handed us an uptodate - * page. If so, be happy and use it - */ - if (PageUptodate(page)) - continue; + /* + * We want to find all the sectors missing from the rbio and + * read them from the disk. If sector_in_rbio() finds a sector + * in the bio list we don't need to read it off the stripe. + */ + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (sector) + continue; - ret = rbio_add_io_page(rbio, &bio_list, page, - stripe, pagenr, rbio->stripe_len); - if (ret) - goto cleanup; - } + sector = rbio_stripe_sector(rbio, stripe, sectornr); + /* + * The bio cache may have handed us an uptodate sector. If so, + * use it. + */ + if (sector->uptodate) + continue; + + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + sectornr, REQ_OP_READ); + if (ret) + goto cleanup; } bios_to_read = bio_list_size(&bio_list); @@ -2627,13 +2681,16 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) * touch it after that. */ atomic_set(&rbio->stripes_pending, bios_to_read); + INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_private = rbio; - bio->bi_end_io = raid56_parity_scrub_end_io; - bio->bi_opf = REQ_OP_READ; + bio->bi_end_io = raid56_bio_end_io; - btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + if (trace_raid56_scrub_read_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_scrub_read(rbio, bio, &trace_info); + } submit_bio(bio); } /* the actual write will happen once the reads are done */ @@ -2651,7 +2708,7 @@ finish: validate_rbio_for_parity_scrub(rbio); } -static void scrub_parity_work(struct btrfs_work *work) +static void scrub_parity_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; @@ -2668,13 +2725,12 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) /* The following code is used for dev replace of a missing RAID 5/6 device. */ struct btrfs_raid_bio * -raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc, - u64 length) +raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; - rbio = alloc_rbio(fs_info, bioc, length); + rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) return NULL; @@ -2688,17 +2744,13 @@ raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc, rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { - BUG(); - kfree(rbio); + btrfs_warn_rl(fs_info, + "can not determine the failed stripe number for full stripe %llu", + bioc->raid_map[0]); + __free_raid_bio(rbio); return NULL; } - /* - * When we get bioc, we have already increased bio_counter, record it - * so we can free it at rbio_orig_end_io() - */ - rbio->generic_bio_cnt = 1; - return rbio; } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 72c00fc284b5..91d5c0adad15 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -7,46 +7,177 @@ #ifndef BTRFS_RAID56_H #define BTRFS_RAID56_H -static inline int nr_parity_stripes(const struct map_lookup *map) -{ - if (map->type & BTRFS_BLOCK_GROUP_RAID5) - return 1; - else if (map->type & BTRFS_BLOCK_GROUP_RAID6) - return 2; - else - return 0; -} +#include <linux/workqueue.h> +#include "volumes.h" + +enum btrfs_rbio_ops { + BTRFS_RBIO_WRITE, + BTRFS_RBIO_READ_REBUILD, + BTRFS_RBIO_PARITY_SCRUB, + BTRFS_RBIO_REBUILD_MISSING, +}; + +struct btrfs_raid_bio { + struct btrfs_io_context *bioc; + + /* + * While we're doing RMW on a stripe we put it into a hash table so we + * can lock the stripe and merge more rbios into it. + */ + struct list_head hash_list; + + /* LRU list for the stripe cache */ + struct list_head stripe_cache; + + /* For scheduling work in the helper threads */ + struct work_struct work; + + /* + * bio_list and bio_list_lock are used to add more bios into the stripe + * in hopes of avoiding the full RMW + */ + struct bio_list bio_list; + spinlock_t bio_list_lock; + + /* + * Also protected by the bio_list_lock, the plug list is used by the + * plugging code to collect partial bios while plugged. The stripe + * locking code also uses it to hand off the stripe lock to the next + * pending IO. + */ + struct list_head plug_list; + + /* Flags that tell us if it is safe to merge with this bio. */ + unsigned long flags; + + /* + * Set if we're doing a parity rebuild for a read from higher up, which + * is handled differently from a parity rebuild as part of RMW. + */ + enum btrfs_rbio_ops operation; + + /* How many pages there are for the full stripe including P/Q */ + u16 nr_pages; + + /* How many sectors there are for the full stripe including P/Q */ + u16 nr_sectors; + + /* Number of data stripes (no p/q) */ + u8 nr_data; + + /* Numer of all stripes (including P/Q) */ + u8 real_stripes; + + /* How many pages there are for each stripe */ + u8 stripe_npages; + + /* How many sectors there are for each stripe */ + u8 stripe_nsectors; + + /* First bad stripe, -1 means no corruption */ + s8 faila; + + /* Second bad stripe (for RAID6 use) */ + s8 failb; + + /* Stripe number that we're scrubbing */ + u8 scrubp; + + /* + * Size of all the bios in the bio_list. This helps us decide if the + * rbio maps to a full stripe or not. + */ + int bio_list_bytes; + + refcount_t refs; + + atomic_t stripes_pending; + + atomic_t error; + + struct work_struct end_io_work; + + /* Bitmap to record which horizontal stripe has data */ + unsigned long dbitmap; + + /* Allocated with stripe_nsectors-many bits for finish_*() calls */ + unsigned long finish_pbitmap; + + /* + * These are two arrays of pointers. We allocate the rbio big enough + * to hold them both and setup their locations when the rbio is + * allocated. + */ + + /* + * Pointers to pages that we allocated for reading/writing stripes + * directly from the disk (including P/Q). + */ + struct page **stripe_pages; + + /* Pointers to the sectors in the bio_list, for faster lookup */ + struct sector_ptr *bio_sectors; + + /* + * For subpage support, we need to map each sector to above + * stripe_pages. + */ + struct sector_ptr *stripe_sectors; + + /* Allocated with real_stripes-many pointers for finish_*() calls */ + void **finish_pointers; +}; + +/* + * For trace event usage only. Records useful debug info for each bio submitted + * by RAID56 to each physical device. + * + * No matter signed or not, (-1) is always the one indicating we can not grab + * the proper stripe number. + */ +struct raid56_bio_trace_info { + u64 devid; + + /* The offset inside the stripe. (<= STRIPE_LEN) */ + u32 offset; + + /* + * Stripe number. + * 0 is the first data stripe, and nr_data for P stripe, + * nr_data + 1 for Q stripe. + * >= real_stripes for + */ + u8 stripe_nr; +}; static inline int nr_data_stripes(const struct map_lookup *map) { - return map->num_stripes - nr_parity_stripes(map); + return map->num_stripes - btrfs_nr_parity_stripes(map->type); } + #define RAID5_P_STRIPE ((u64)-2) #define RAID6_Q_STRIPE ((u64)-1) #define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ ((x) == RAID6_Q_STRIPE)) -struct btrfs_raid_bio; struct btrfs_device; -int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, - u64 stripe_len, int mirror_num, int generic_io); -int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, - u64 stripe_len); +void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, + int mirror_num); +void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc); void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, - u64 logical); + unsigned int pgoff, u64 logical); struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, - struct btrfs_io_context *bioc, u64 stripe_len, + struct btrfs_io_context *bioc, struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors); void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); struct btrfs_raid_bio * -raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc, - u64 length); +raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc); void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio); int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index a3930da4eb3f..f50586ff85c8 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -5,6 +5,7 @@ #include "compression.h" #include "ctree.h" #include "delalloc-space.h" +#include "disk-io.h" #include "reflink.h" #include "transaction.h" #include "subpage.h" @@ -22,8 +23,10 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, int ret; inode_inc_iversion(inode); - if (!no_time_update) - inode->i_mtime = inode->i_ctime = current_time(inode); + if (!no_time_update) { + inode->i_mtime = current_time(inode); + inode->i_ctime = inode->i_mtime; + } /* * We round up to the block size at eof when determining which * extents to clone above, but shouldn't round up the file size. @@ -89,7 +92,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode, clear_extent_bit(&inode->io_tree, file_offset, range_end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, NULL); + NULL); ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL); if (ret) goto out_unlock; @@ -110,7 +113,6 @@ static int copy_inline_to_page(struct btrfs_inode *inode, if (comp_type == BTRFS_COMPRESS_NONE) { memcpy_to_page(page, offset_in_page(file_offset), data_start, datal); - flush_dcache_page(page); } else { ret = btrfs_decompress(comp_type, data_start, page, offset_in_page(file_offset), @@ -132,10 +134,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode, * * So what's in the range [500, 4095] corresponds to zeroes. */ - if (datal < block_size) { + if (datal < block_size) memzero_page(page, datal, block_size - datal); - flush_dcache_page(page); - } btrfs_page_set_uptodate(fs_info, page, file_offset, block_size); btrfs_page_clear_checked(fs_info, page, file_offset, block_size); @@ -277,7 +277,7 @@ copy_inline_extent: path->slots[0]), size); btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found); - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags); + btrfs_set_inode_full_sync(BTRFS_I(dst)); ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end); out: if (!ret && !trans) { @@ -344,6 +344,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, int ret; const u64 len = olen_aligned; u64 last_dest_end = destoff; + u64 prev_extent_end = off; ret = -ENOMEM; buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); @@ -363,7 +364,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode, key.offset = off; while (1) { - u64 next_key_min_offset = key.offset + 1; struct btrfs_file_extent_item *extent; u64 extent_gen; int type; @@ -431,14 +431,21 @@ process_slot: * The first search might have left us at an extent item that * ends before our target range's start, can happen if we have * holes and NO_HOLES feature enabled. + * + * Subsequent searches may leave us on a file range we have + * processed before - this happens due to a race with ordered + * extent completion for a file range that is outside our source + * range, but that range was part of a file extent item that + * also covered a leading part of our source range. */ - if (key.offset + datal <= off) { + if (key.offset + datal <= prev_extent_end) { path->slots[0]++; goto process_slot; } else if (key.offset >= off + len) { break; } - next_key_min_offset = key.offset + datal; + + prev_extent_end = key.offset + datal; size = btrfs_item_size(leaf, slot); read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), size); @@ -489,12 +496,14 @@ process_slot: clone_info.file_offset = new_key.offset; clone_info.extent_buf = buf; clone_info.is_new_extent = false; + clone_info.update_times = !no_time_update; ret = btrfs_replace_file_extents(BTRFS_I(inode), path, drop_start, new_key.offset + datal - 1, &clone_info, &trans); if (ret) goto out; - } else if (type == BTRFS_FILE_EXTENT_INLINE) { + } else { + ASSERT(type == BTRFS_FILE_EXTENT_INLINE); /* * Inline extents always have to start at file offset 0 * and can never be bigger then the sector size. We can @@ -505,8 +514,12 @@ process_slot: */ ASSERT(key.offset == 0); ASSERT(datal <= fs_info->sectorsize); - if (key.offset != 0 || datal > fs_info->sectorsize) - return -EUCLEAN; + if (WARN_ON(type != BTRFS_FILE_EXTENT_INLINE) || + WARN_ON(key.offset != 0) || + WARN_ON(datal > fs_info->sectorsize)) { + ret = -EUCLEAN; + goto out; + } ret = clone_copy_inline_extent(inode, path, &new_key, drop_start, datal, size, @@ -518,17 +531,22 @@ process_slot: btrfs_release_path(path); /* - * If this is a new extent update the last_reflink_trans of both - * inodes. This is used by fsync to make sure it does not log - * multiple checksum items with overlapping ranges. For older - * extents we don't need to do it since inode logging skips the - * checksums for older extents. Also ignore holes and inline - * extents because they don't have checksums in the csum tree. + * Whenever we share an extent we update the last_reflink_trans + * of each inode to the current transaction. This is needed to + * make sure fsync does not log multiple checksum items with + * overlapping ranges (because some extent items might refer + * only to sections of the original extent). For the destination + * inode we do this regardless of the generation of the extents + * or even if they are inline extents or explicit holes, to make + * sure a full fsync does not skip them. For the source inode, + * we only need to update last_reflink_trans in case it's a new + * extent that is not a hole or an inline extent, to deal with + * the checksums problem on fsync. */ - if (extent_gen == trans->transid && disko > 0) { + if (extent_gen == trans->transid && disko > 0) BTRFS_I(src)->last_reflink_trans = trans->transid; - BTRFS_I(inode)->last_reflink_trans = trans->transid; - } + + BTRFS_I(inode)->last_reflink_trans = trans->transid; last_dest_end = ALIGN(new_key.offset + datal, fs_info->sectorsize); @@ -540,7 +558,7 @@ process_slot: break; btrfs_release_path(path); - key.offset = next_key_min_offset; + key.offset = prev_extent_end; if (fatal_signal_pending(current)) { ret = -EINTR; @@ -575,8 +593,7 @@ process_slot: * replaced file extent items. */ if (last_dest_end >= i_size_read(inode)) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + btrfs_set_inode_full_sync(BTRFS_I(inode)); ret = btrfs_replace_file_extents(BTRFS_I(inode), path, last_dest_end, destoff + len - 1, NULL, &trans); @@ -598,21 +615,30 @@ out: static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, struct inode *inode2, u64 loff2, u64 len) { - unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); - unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); + unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1, NULL); + unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1, NULL); } static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, struct inode *inode2, u64 loff2, u64 len) { + u64 range1_end = loff1 + len - 1; + u64 range2_end = loff2 + len - 1; + if (inode1 < inode2) { swap(inode1, inode2); swap(loff1, loff2); + swap(range1_end, range2_end); } else if (inode1 == inode2 && loff2 < loff1) { swap(loff1, loff2); + swap(range1_end, range2_end); } - lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); - lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); + + lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end, NULL); + lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end, NULL); + + btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end); + btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end); } static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2) @@ -632,17 +658,20 @@ static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2) static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, struct inode *dst, u64 dst_loff) { - const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; + struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info; + const u64 bs = fs_info->sb->s_blocksize; int ret; /* - * Lock destination range to serialize with concurrent readpages() and + * Lock destination range to serialize with concurrent readahead() and * source range to serialize with relocation. */ btrfs_double_extent_lock(src, loff, dst, dst_loff, len); ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); + btrfs_btree_balance_dirty(fs_info); + return ret; } @@ -730,7 +759,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, } /* - * Lock destination range to serialize with concurrent readpages() and + * Lock destination range to serialize with concurrent readahead() and * source range to serialize with relocation. */ btrfs_double_extent_lock(src, off, inode, destoff, len); @@ -752,6 +781,8 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, round_down(destoff, PAGE_SIZE), round_up(destoff + len, PAGE_SIZE) - 1); + btrfs_btree_balance_dirty(fs_info); + return ret; } @@ -762,7 +793,6 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize; - bool same_inode = inode_out == inode_in; u64 wb_len; int ret; @@ -772,9 +802,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (btrfs_root_readonly(root_out)) return -EROFS; - if (file_in->f_path.mnt != file_out->f_path.mnt || - inode_in->i_sb != inode_out->i_sb) - return -EXDEV; + ASSERT(inode_in->i_sb == inode_out->i_sb); } /* Don't make the dst file partly checksummed */ @@ -803,15 +831,6 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, wb_len = ALIGN(*len, bs); /* - * Since we don't lock ranges, wait for ongoing lockless dio writes (as - * any in progress could create its ordered extents after we wait for - * existing ordered extents below). - */ - inode_dio_wait(inode_in); - if (!same_inode) - inode_dio_wait(inode_out); - - /* * Workaround to make sure NOCOW buffered write reach disk as NOCOW. * * Btrfs' back references do not have a block level granularity, they diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f5465197996d..666a37a0ee89 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -362,7 +362,7 @@ struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr) rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); - root = (struct btrfs_root *)node->data; + root = node->data; } spin_unlock(&rc->reloc_root_tree.lock); return btrfs_grab_root(root); @@ -1101,7 +1101,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, continue; /* - * if we are modifying block in fs tree, wait for readpage + * if we are modifying block in fs tree, wait for read_folio * to complete and drop the extent cache */ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { @@ -1124,10 +1124,10 @@ int replace_file_extents(struct btrfs_trans_handle *trans, if (!ret) continue; - btrfs_drop_extent_cache(BTRFS_I(inode), - key.offset, end, 1); + btrfs_drop_extent_map_range(BTRFS_I(inode), + key.offset, end, true); unlock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end); + key.offset, end, NULL); } } @@ -1326,7 +1326,9 @@ again: btrfs_release_path(path); path->lowest_level = level; + set_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state); ret = btrfs_search_slot(trans, src, &key, path, 0, 1); + clear_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state); path->lowest_level = 0; if (ret) { if (ret > 0) @@ -1563,10 +1565,10 @@ static int invalidate_extent_cache(struct btrfs_root *root, end = (u64)-1; } - /* the lock_extent waits for readpage to complete */ - lock_extent(&BTRFS_I(inode)->io_tree, start, end); - btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end); + /* the lock_extent waits for read_folio to complete */ + lock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL); + btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL); } return 0; } @@ -2599,9 +2601,9 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, eb = read_tree_block(fs_info, block->bytenr, block->owner, block->key.offset, block->level, NULL); - if (IS_ERR(eb)) { + if (IS_ERR(eb)) return PTR_ERR(eb); - } else if (!extent_buffer_uptodate(eb)) { + if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; } @@ -2818,7 +2820,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( * Subpage can't handle page with DIRTY but without UPTODATE * bit as it can lead to the following deadlock: * - * btrfs_readpage() + * btrfs_read_folio() * | Page already *locked* * |- btrfs_lock_and_flush_ordered_range() * |- btrfs_start_ordered_extent() @@ -2867,13 +2869,13 @@ static noinline_for_stack int prealloc_file_extent_cluster( else end = cluster->end - offset; - lock_extent(&inode->io_tree, start, end); + lock_extent(&inode->io_tree, start, end, NULL); num_bytes = end + 1 - start; ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); cur_offset = end + 1; - unlock_extent(&inode->io_tree, start, end); + unlock_extent(&inode->io_tree, start, end, NULL); if (ret) break; } @@ -2888,7 +2890,6 @@ static noinline_for_stack int prealloc_file_extent_cluster( static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode, u64 start, u64 end, u64 block_start) { - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_map *em; int ret = 0; @@ -2902,18 +2903,11 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod em->block_start = block_start; set_bit(EXTENT_FLAG_PINNED, &em->flags); - lock_extent(&BTRFS_I(inode)->io_tree, start, end); - while (1) { - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); - write_unlock(&em_tree->lock); - if (ret != -EEXIST) { - free_extent_map(em); - break; - } - btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0); - } - unlock_extent(&BTRFS_I(inode)->io_tree, start, end); + lock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL); + ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL); + free_extent_map(em); + return ret; } @@ -2967,11 +2961,12 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, goto release_page; if (PageReadahead(page)) - page_cache_async_readahead(inode->i_mapping, ra, NULL, page, - page_index, last_index + 1 - page_index); + page_cache_async_readahead(inode->i_mapping, ra, NULL, + page_folio(page), page_index, + last_index + 1 - page_index); if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); + btrfs_read_folio(NULL, page_folio(page)); lock_page(page); if (!PageUptodate(page)) { ret = -EIO; @@ -2997,12 +2992,13 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, /* Reserve metadata for this range */ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), - clamped_len); + clamped_len, clamped_len, + false); if (ret) goto release_page; /* Mark the range delalloc and dirty for later writeback */ - lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end); + lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, NULL); ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start, clamped_end, 0, NULL); if (ret) { @@ -3035,7 +3031,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, boundary_start, boundary_end, EXTENT_BOUNDARY); } - unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end); + unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, NULL); btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); cur += clamped_len; @@ -3571,7 +3567,12 @@ int prepare_to_relocate(struct reloc_control *rc) */ return PTR_ERR(trans); } - return btrfs_commit_transaction(trans); + + ret = btrfs_commit_transaction(trans); + if (ret) + unset_reloc_control(rc); + + return ret; } static noinline_for_stack int relocate_block_group(struct reloc_control *rc) @@ -3845,8 +3846,7 @@ out: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); if (err) { - if (inode) - iput(inode); + iput(inode); inode = ERR_PTR(err); } return inode; @@ -3960,10 +3960,34 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) int rw = 0; int err = 0; + /* + * This only gets set if we had a half-deleted snapshot on mount. We + * cannot allow relocation to start while we're still trying to clean up + * these pending deletions. + */ + ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE); + if (ret) + return ret; + + /* We may have been woken up by close_ctree, so bail if we're closing. */ + if (btrfs_fs_closing(fs_info)) + return -EINTR; + bg = btrfs_lookup_block_group(fs_info, group_start); if (!bg) return -ENOENT; + /* + * Relocation of a data block group creates ordered extents. Without + * sb_start_write(), we can freeze the filesystem while unfinished + * ordered extents are left. Such ordered extents can cause a deadlock + * e.g. when syncfs() is waiting for their completion but they can't + * finish because they block when joining a transaction, due to the + * fact that the freeze locks are being held in write mode. + */ + if (bg->flags & BTRFS_BLOCK_GROUP_DATA) + ASSERT(sb_write_started(fs_info->sb)); + if (btrfs_pinned_by_swapfile(fs_info, bg)) { btrfs_put_block_group(bg); return -ETXTBSY; @@ -4110,9 +4134,8 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) * this function resumes merging reloc trees with corresponding fs trees. * this is important for keeping the sharing of tree blocks */ -int btrfs_recover_relocation(struct btrfs_root *root) +int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = root->fs_info; LIST_HEAD(reloc_roots); struct btrfs_key key; struct btrfs_root *fs_root; @@ -4153,7 +4176,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) key.type != BTRFS_ROOT_ITEM_KEY) break; - reloc_root = btrfs_read_tree_root(root, &key); + reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key); if (IS_ERR(reloc_root)) { err = PTR_ERR(reloc_root); goto out; @@ -4308,7 +4331,7 @@ int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len) disk_bytenr = file_pos + inode->index_cnt; csum_root = btrfs_csum_root(fs_info, disk_bytenr); ret = btrfs_lookup_csums_range(csum_root, disk_bytenr, - disk_bytenr + len - 1, &list, 0); + disk_bytenr + len - 1, &list, 0, false); if (ret) goto out; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 3d68d2dcd83e..e1f599d7a916 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -278,6 +278,21 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)); if (btrfs_root_refs(&root->root_item) == 0) { + struct btrfs_key drop_key; + + btrfs_disk_key_to_cpu(&drop_key, &root->root_item.drop_progress); + /* + * If we have a non-zero drop_progress then we know we + * made it partly through deleting this snapshot, and + * thus we need to make sure we block any balance from + * happening until this snapshot is completely dropped. + */ + if (drop_key.objectid != 0 || drop_key.type != 0 || + drop_key.offset != 0) { + set_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); + set_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state); + } + set_bit(BTRFS_ROOT_DEAD_TREE, &root->state); btrfs_add_dead_root(root); } @@ -322,7 +337,6 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, struct extent_buffer *leaf; struct btrfs_key key; unsigned long ptr; - int err = 0; int ret; path = btrfs_alloc_path(); @@ -334,9 +348,9 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, key.offset = ref_id; again: ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); - if (ret < 0) + if (ret < 0) { goto out; - if (ret == 0) { + } else if (ret == 0) { leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); @@ -344,18 +358,18 @@ again: if ((btrfs_root_ref_dirid(leaf, ref) != dirid) || (btrfs_root_ref_name_len(leaf, ref) != name_len) || memcmp_extent_buffer(leaf, name, ptr, name_len)) { - err = -ENOENT; + ret = -ENOENT; goto out; } *sequence = btrfs_root_ref_sequence(leaf, ref); ret = btrfs_del_item(trans, tree_root, path); - if (ret) { - err = ret; + if (ret) goto out; - } - } else - err = -ENOENT; + } else { + ret = -ENOENT; + goto out; + } if (key.type == BTRFS_ROOT_BACKREF_KEY) { btrfs_release_path(path); @@ -367,7 +381,7 @@ again: out: btrfs_free_path(path); - return err; + return ret; } /* @@ -494,7 +508,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, /* One for parent inode, two for dir entries */ qgroup_num_bytes = 3 * fs_info->nodesize; ret = btrfs_qgroup_reserve_meta_prealloc(root, - qgroup_num_bytes, true); + qgroup_num_bytes, true, + false); if (ret) return ret; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2e9a322773f2..196c4c6ed1ed 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -45,14 +45,16 @@ struct scrub_ctx; * operations. The first one configures an upper limit for the number * of (dynamically allocated) pages that are added to a bio. */ -#define SCRUB_PAGES_PER_BIO 32 /* 128KiB per bio for x86 */ -#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for x86 */ +#define SCRUB_SECTORS_PER_BIO 32 /* 128KiB per bio for 4KiB pages */ +#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for 4KiB pages */ /* * The following value times PAGE_SIZE needs to be large enough to match the * largest node/leaf/sector size that shall be supported. */ -#define SCRUB_MAX_PAGES_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K) +#define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K) + +#define SCRUB_MAX_PAGES (DIV_ROUND_UP(BTRFS_MAX_METADATA_BLOCKSIZE, PAGE_SIZE)) struct scrub_recover { refcount_t refs; @@ -60,18 +62,14 @@ struct scrub_recover { u64 map_length; }; -struct scrub_page { +struct scrub_sector { struct scrub_block *sblock; - struct page *page; - struct btrfs_device *dev; struct list_head list; u64 flags; /* extent flags */ u64 generation; - u64 logical; - u64 physical; - u64 physical_for_dev_replace; + /* Offset in bytes to @sblock. */ + u32 offset; atomic_t refs; - u8 mirror_num; unsigned int have_csum:1; unsigned int io_error:1; u8 csum[BTRFS_CSUM_SIZE]; @@ -87,16 +85,30 @@ struct scrub_bio { blk_status_t status; u64 logical; u64 physical; - struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; - int page_count; + struct scrub_sector *sectors[SCRUB_SECTORS_PER_BIO]; + int sector_count; int next_free; - struct btrfs_work work; + struct work_struct work; }; struct scrub_block { - struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK]; - int page_count; - atomic_t outstanding_pages; + /* + * Each page will have its page::private used to record the logical + * bytenr. + */ + struct page *pages[SCRUB_MAX_PAGES]; + struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK]; + struct btrfs_device *dev; + /* Logical bytenr of the sblock */ + u64 logical; + u64 physical; + u64 physical_for_dev_replace; + /* Length of sblock in bytes */ + u32 len; + int sector_count; + int mirror_num; + + atomic_t outstanding_sectors; refcount_t refs; /* free mem on transition to zero */ struct scrub_ctx *sctx; struct scrub_parity *sparity; @@ -110,7 +122,7 @@ struct scrub_block { /* It is for the data with checksum */ unsigned int data_corrected:1; }; - struct btrfs_work work; + struct work_struct work; }; /* Used for the chunks with parity stripe such RAID5/6 */ @@ -129,21 +141,19 @@ struct scrub_parity { refcount_t refs; - struct list_head spages; + struct list_head sectors_list; /* Work of parity check and repair */ - struct btrfs_work work; + struct work_struct work; /* Mark the parity blocks which have data */ - unsigned long *dbitmap; + unsigned long dbitmap; /* * Mark the parity blocks which have data, but errors happen when * read data or check data */ - unsigned long *ebitmap; - - unsigned long bitmap[]; + unsigned long ebitmap; }; struct scrub_ctx { @@ -158,7 +168,7 @@ struct scrub_ctx { struct list_head csum_list; atomic_t cancel_req; int readonly; - int pages_per_bio; + int sectors_per_bio; /* State of IO submission throttling affecting the associated device */ ktime_t throttle_deadline; @@ -204,51 +214,217 @@ struct full_stripe_lock { struct mutex mutex; }; +#ifndef CONFIG_64BIT +/* This structure is for archtectures whose (void *) is smaller than u64 */ +struct scrub_page_private { + u64 logical; +}; +#endif + +static int attach_scrub_page_private(struct page *page, u64 logical) +{ +#ifdef CONFIG_64BIT + attach_page_private(page, (void *)logical); + return 0; +#else + struct scrub_page_private *spp; + + spp = kmalloc(sizeof(*spp), GFP_KERNEL); + if (!spp) + return -ENOMEM; + spp->logical = logical; + attach_page_private(page, (void *)spp); + return 0; +#endif +} + +static void detach_scrub_page_private(struct page *page) +{ +#ifdef CONFIG_64BIT + detach_page_private(page); + return; +#else + struct scrub_page_private *spp; + + spp = detach_page_private(page); + kfree(spp); + return; +#endif +} + +static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx, + struct btrfs_device *dev, + u64 logical, u64 physical, + u64 physical_for_dev_replace, + int mirror_num) +{ + struct scrub_block *sblock; + + sblock = kzalloc(sizeof(*sblock), GFP_KERNEL); + if (!sblock) + return NULL; + refcount_set(&sblock->refs, 1); + sblock->sctx = sctx; + sblock->logical = logical; + sblock->physical = physical; + sblock->physical_for_dev_replace = physical_for_dev_replace; + sblock->dev = dev; + sblock->mirror_num = mirror_num; + sblock->no_io_error_seen = 1; + /* + * Scrub_block::pages will be allocated at alloc_scrub_sector() when + * the corresponding page is not allocated. + */ + return sblock; +} + +/* + * Allocate a new scrub sector and attach it to @sblock. + * + * Will also allocate new pages for @sblock if needed. + */ +static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock, + u64 logical, gfp_t gfp) +{ + const pgoff_t page_index = (logical - sblock->logical) >> PAGE_SHIFT; + struct scrub_sector *ssector; + + /* We must never have scrub_block exceed U32_MAX in size. */ + ASSERT(logical - sblock->logical < U32_MAX); + + ssector = kzalloc(sizeof(*ssector), gfp); + if (!ssector) + return NULL; + + /* Allocate a new page if the slot is not allocated */ + if (!sblock->pages[page_index]) { + int ret; + + sblock->pages[page_index] = alloc_page(gfp); + if (!sblock->pages[page_index]) { + kfree(ssector); + return NULL; + } + ret = attach_scrub_page_private(sblock->pages[page_index], + sblock->logical + (page_index << PAGE_SHIFT)); + if (ret < 0) { + kfree(ssector); + __free_page(sblock->pages[page_index]); + sblock->pages[page_index] = NULL; + return NULL; + } + } + + atomic_set(&ssector->refs, 1); + ssector->sblock = sblock; + /* The sector to be added should not be used */ + ASSERT(sblock->sectors[sblock->sector_count] == NULL); + ssector->offset = logical - sblock->logical; + + /* The sector count must be smaller than the limit */ + ASSERT(sblock->sector_count < SCRUB_MAX_SECTORS_PER_BLOCK); + + sblock->sectors[sblock->sector_count] = ssector; + sblock->sector_count++; + sblock->len += sblock->sctx->fs_info->sectorsize; + + return ssector; +} + +static struct page *scrub_sector_get_page(struct scrub_sector *ssector) +{ + struct scrub_block *sblock = ssector->sblock; + pgoff_t index; + /* + * When calling this function, ssector must be alreaday attached to the + * parent sblock. + */ + ASSERT(sblock); + + /* The range should be inside the sblock range */ + ASSERT(ssector->offset < sblock->len); + + index = ssector->offset >> PAGE_SHIFT; + ASSERT(index < SCRUB_MAX_PAGES); + ASSERT(sblock->pages[index]); + ASSERT(PagePrivate(sblock->pages[index])); + return sblock->pages[index]; +} + +static unsigned int scrub_sector_get_page_offset(struct scrub_sector *ssector) +{ + struct scrub_block *sblock = ssector->sblock; + + /* + * When calling this function, ssector must be already attached to the + * parent sblock. + */ + ASSERT(sblock); + + /* The range should be inside the sblock range */ + ASSERT(ssector->offset < sblock->len); + + return offset_in_page(ssector->offset); +} + +static char *scrub_sector_get_kaddr(struct scrub_sector *ssector) +{ + return page_address(scrub_sector_get_page(ssector)) + + scrub_sector_get_page_offset(ssector); +} + +static int bio_add_scrub_sector(struct bio *bio, struct scrub_sector *ssector, + unsigned int len) +{ + return bio_add_page(bio, scrub_sector_get_page(ssector), len, + scrub_sector_get_page_offset(ssector)); +} + static int scrub_setup_recheck_block(struct scrub_block *original_sblock, - struct scrub_block *sblocks_for_recheck); + struct scrub_block *sblocks_for_recheck[]); static void scrub_recheck_block(struct btrfs_fs_info *fs_info, struct scrub_block *sblock, int retry_failed_mirror); static void scrub_recheck_block_checksum(struct scrub_block *sblock); static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good); -static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, +static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good, - int page_num, int force_write); + int sector_num, int force_write); static void scrub_write_block_to_dev_replace(struct scrub_block *sblock); -static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, - int page_num); +static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, + int sector_num); static int scrub_checksum_data(struct scrub_block *sblock); static int scrub_checksum_tree_block(struct scrub_block *sblock); static int scrub_checksum_super(struct scrub_block *sblock); static void scrub_block_put(struct scrub_block *sblock); -static void scrub_page_get(struct scrub_page *spage); -static void scrub_page_put(struct scrub_page *spage); +static void scrub_sector_get(struct scrub_sector *sector); +static void scrub_sector_put(struct scrub_sector *sector); static void scrub_parity_get(struct scrub_parity *sparity); static void scrub_parity_put(struct scrub_parity *sparity); -static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, - u64 physical, struct btrfs_device *dev, u64 flags, - u64 gen, int mirror_num, u8 *csum, - u64 physical_for_dev_replace); +static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, + u64 physical, struct btrfs_device *dev, u64 flags, + u64 gen, int mirror_num, u8 *csum, + u64 physical_for_dev_replace); static void scrub_bio_end_io(struct bio *bio); -static void scrub_bio_end_io_worker(struct btrfs_work *work); +static void scrub_bio_end_io_worker(struct work_struct *work); static void scrub_block_complete(struct scrub_block *sblock); -static void scrub_remap_extent(struct btrfs_fs_info *fs_info, - u64 extent_logical, u32 extent_len, - u64 *extent_physical, - struct btrfs_device **extent_dev, - int *extent_mirror_num); -static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, - struct scrub_page *spage); +static void scrub_find_good_copy(struct btrfs_fs_info *fs_info, + u64 extent_logical, u32 extent_len, + u64 *extent_physical, + struct btrfs_device **extent_dev, + int *extent_mirror_num); +static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx, + struct scrub_sector *sector); static void scrub_wr_submit(struct scrub_ctx *sctx); static void scrub_wr_bio_end_io(struct bio *bio); -static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); +static void scrub_wr_bio_end_io_worker(struct work_struct *work); static void scrub_put_ctx(struct scrub_ctx *sctx); -static inline int scrub_is_page_on_raid56(struct scrub_page *spage) +static inline int scrub_is_page_on_raid56(struct scrub_sector *sector) { - return spage->recover && - (spage->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); + return sector->recover && + (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); } static void scrub_pending_bio_inc(struct scrub_ctx *sctx) @@ -535,10 +711,8 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) if (sctx->curr != -1) { struct scrub_bio *sbio = sctx->bios[sctx->curr]; - for (i = 0; i < sbio->page_count; i++) { - WARN_ON(!sbio->pagev[i]->page); - scrub_block_put(sbio->pagev[i]->sblock); - } + for (i = 0; i < sbio->sector_count; i++) + scrub_block_put(sbio->sectors[i]->sblock); bio_put(sbio->bio); } @@ -572,7 +746,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( goto nomem; refcount_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; - sctx->pages_per_bio = SCRUB_PAGES_PER_BIO; + sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO; sctx->curr = -1; sctx->fs_info = fs_info; INIT_LIST_HEAD(&sctx->csum_list); @@ -586,9 +760,8 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( sbio->index = i; sbio->sctx = sctx; - sbio->page_count = 0; - btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL, - NULL); + sbio->sector_count = 0; + INIT_WORK(&sbio->work, scrub_bio_end_io_worker); if (i != SCRUB_BIOS_PER_SCTX - 1) sctx->bios[i]->next_free = i + 1; @@ -728,16 +901,23 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) u8 ref_level = 0; int ret; - WARN_ON(sblock->page_count < 1); - dev = sblock->pagev[0]->dev; + WARN_ON(sblock->sector_count < 1); + dev = sblock->dev; fs_info = sblock->sctx->fs_info; + /* Super block error, no need to search extent tree. */ + if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { + btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu", + errstr, rcu_str_deref(dev->name), + sblock->physical); + return; + } path = btrfs_alloc_path(); if (!path) return; - swarn.physical = sblock->pagev[0]->physical; - swarn.logical = sblock->pagev[0]->logical; + swarn.physical = sblock->physical; + swarn.logical = sblock->logical; swarn.errstr = errstr; swarn.dev = NULL; @@ -798,8 +978,8 @@ static inline void scrub_put_recover(struct btrfs_fs_info *fs_info, /* * scrub_handle_errored_block gets called when either verification of the - * pages failed or the bio failed to read, e.g. with EIO. In the latter - * case, this function handles all pages in the bio, even though only one + * sectors failed or the bio failed to read, e.g. with EIO. In the latter + * case, this function handles all sectors in the bio, even though only one * may be bad. * The goal of this function is to repair the errored block by using the * contents of one of the mirrors. @@ -807,43 +987,45 @@ static inline void scrub_put_recover(struct btrfs_fs_info *fs_info, static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) { struct scrub_ctx *sctx = sblock_to_check->sctx; - struct btrfs_device *dev; + struct btrfs_device *dev = sblock_to_check->dev; struct btrfs_fs_info *fs_info; u64 logical; unsigned int failed_mirror_index; unsigned int is_metadata; unsigned int have_csum; - struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */ + /* One scrub_block for each mirror */ + struct scrub_block *sblocks_for_recheck[BTRFS_MAX_MIRRORS] = { 0 }; struct scrub_block *sblock_bad; int ret; int mirror_index; - int page_num; + int sector_num; int success; bool full_stripe_locked; unsigned int nofs_flag; static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - BUG_ON(sblock_to_check->page_count < 1); + BUG_ON(sblock_to_check->sector_count < 1); fs_info = sctx->fs_info; - if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { + if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { /* - * if we find an error in a super block, we just report it. + * If we find an error in a super block, we just report it. * They will get written with the next transaction commit * anyway */ + scrub_print_warning("super block error", sblock_to_check); spin_lock(&sctx->stat_lock); ++sctx->stat.super_errors; spin_unlock(&sctx->stat_lock); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); return 0; } - logical = sblock_to_check->pagev[0]->logical; - BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1); - failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1; - is_metadata = !(sblock_to_check->pagev[0]->flags & + logical = sblock_to_check->logical; + ASSERT(sblock_to_check->mirror_num); + failed_mirror_index = sblock_to_check->mirror_num - 1; + is_metadata = !(sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA); - have_csum = sblock_to_check->pagev[0]->have_csum; - dev = sblock_to_check->pagev[0]->dev; + have_csum = sblock_to_check->sectors[0]->have_csum; if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical)) return 0; @@ -854,7 +1036,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * might be waiting the scrub task to pause (which needs to wait for all * the worker tasks to complete before pausing). * We do allocations in the workers through insert_full_stripe_lock() - * and scrub_add_page_to_wr_bio(), which happens down the call chain of + * and scrub_add_sector_to_wr_bio(), which happens down the call chain of * this function. */ nofs_flag = memalloc_nofs_save(); @@ -905,20 +1087,31 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * repaired area is verified in order to correctly maintain * the statistics. */ - - sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS, - sizeof(*sblocks_for_recheck), GFP_KERNEL); - if (!sblocks_for_recheck) { - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - sctx->stat.read_errors++; - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); - goto out; + for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) { + /* + * Note: the two members refs and outstanding_sectors are not + * used in the blocks that are used for the recheck procedure. + * + * But alloc_scrub_block() will initialize sblock::ref anyway, + * so we can use scrub_block_put() to clean them up. + * + * And here we don't setup the physical/dev for the sblock yet, + * they will be correctly initialized in scrub_setup_recheck_block(). + */ + sblocks_for_recheck[mirror_index] = alloc_scrub_block(sctx, NULL, + logical, 0, 0, mirror_index); + if (!sblocks_for_recheck[mirror_index]) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + sctx->stat.read_errors++; + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); + goto out; + } } - /* setup the context, map the logical blocks and alloc the pages */ + /* Setup the context, map the logical blocks and alloc the sectors */ ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck); if (ret) { spin_lock(&sctx->stat_lock); @@ -929,7 +1122,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) goto out; } BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); - sblock_bad = sblocks_for_recheck + failed_mirror_index; + sblock_bad = sblocks_for_recheck[failed_mirror_index]; /* build and submit the bios for the failed mirror, check checksums */ scrub_recheck_block(fs_info, sblock_bad, 1); @@ -937,7 +1130,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) if (!sblock_bad->header_error && !sblock_bad->checksum_error && sblock_bad->no_io_error_seen) { /* - * the error disappeared after reading page by page, or + * The error disappeared after reading sector by sector, or * the area was part of a huge bio and other parts of the * bio caused I/O errors, or the block layer merged several * read requests into one and the error is caused by a @@ -998,10 +1191,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * that is known to contain an error is rewritten. Afterwards * the block is known to be corrected. * If a mirror is found which is completely correct, and no - * checksum is present, only those pages are rewritten that had + * checksum is present, only those sectors are rewritten that had * an I/O error in the block to be repaired, since it cannot be - * determined, which copy of the other pages is better (and it - * could happen otherwise that a correct page would be + * determined, which copy of the other sectors is better (and it + * could happen otherwise that a correct sector would be * overwritten by a bad one). */ for (mirror_index = 0; ;mirror_index++) { @@ -1011,25 +1204,25 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) continue; /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */ - if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) { + if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) { if (mirror_index >= BTRFS_MAX_MIRRORS) break; - if (!sblocks_for_recheck[mirror_index].page_count) + if (!sblocks_for_recheck[mirror_index]->sector_count) break; - sblock_other = sblocks_for_recheck + mirror_index; + sblock_other = sblocks_for_recheck[mirror_index]; } else { - struct scrub_recover *r = sblock_bad->pagev[0]->recover; + struct scrub_recover *r = sblock_bad->sectors[0]->recover; int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs; if (mirror_index >= max_allowed) break; - if (!sblocks_for_recheck[1].page_count) + if (!sblocks_for_recheck[1]->sector_count) break; ASSERT(failed_mirror_index == 0); - sblock_other = sblocks_for_recheck + 1; - sblock_other->pagev[0]->mirror_num = 1 + mirror_index; + sblock_other = sblocks_for_recheck[1]; + sblock_other->mirror_num = 1 + mirror_index; } /* build and submit the bios, check checksums */ @@ -1078,16 +1271,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * area are unreadable. */ success = 1; - for (page_num = 0; page_num < sblock_bad->page_count; - page_num++) { - struct scrub_page *spage_bad = sblock_bad->pagev[page_num]; + for (sector_num = 0; sector_num < sblock_bad->sector_count; + sector_num++) { + struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num]; struct scrub_block *sblock_other = NULL; - /* skip no-io-error page in scrub */ - if (!spage_bad->io_error && !sctx->is_dev_replace) + /* Skip no-io-error sectors in scrub */ + if (!sector_bad->io_error && !sctx->is_dev_replace) continue; - if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) { + if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) { /* * In case of dev replace, if raid56 rebuild process * didn't work out correct data, then copy the content @@ -1096,16 +1289,15 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * sblock_for_recheck array to target device. */ sblock_other = NULL; - } else if (spage_bad->io_error) { - /* try to find no-io-error page in mirrors */ + } else if (sector_bad->io_error) { + /* Try to find no-io-error sector in mirrors */ for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index].page_count > 0; + sblocks_for_recheck[mirror_index]->sector_count > 0; mirror_index++) { - if (!sblocks_for_recheck[mirror_index]. - pagev[page_num]->io_error) { - sblock_other = sblocks_for_recheck + - mirror_index; + if (!sblocks_for_recheck[mirror_index]-> + sectors[sector_num]->io_error) { + sblock_other = sblocks_for_recheck[mirror_index]; break; } } @@ -1115,27 +1307,26 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) if (sctx->is_dev_replace) { /* - * did not find a mirror to fetch the page - * from. scrub_write_page_to_dev_replace() - * handles this case (page->io_error), by - * filling the block with zeros before - * submitting the write request + * Did not find a mirror to fetch the sector from. + * scrub_write_sector_to_dev_replace() handles this + * case (sector->io_error), by filling the block with + * zeros before submitting the write request */ if (!sblock_other) sblock_other = sblock_bad; - if (scrub_write_page_to_dev_replace(sblock_other, - page_num) != 0) { + if (scrub_write_sector_to_dev_replace(sblock_other, + sector_num) != 0) { atomic64_inc( &fs_info->dev_replace.num_write_errors); success = 0; } } else if (sblock_other) { - ret = scrub_repair_page_from_good_copy(sblock_bad, - sblock_other, - page_num, 0); + ret = scrub_repair_sector_from_good_copy(sblock_bad, + sblock_other, + sector_num, 0); if (0 == ret) - spage_bad->io_error = 0; + sector_bad->io_error = 0; else success = 0; } @@ -1180,27 +1371,28 @@ did_not_correct_error: } out: - if (sblocks_for_recheck) { - for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; - mirror_index++) { - struct scrub_block *sblock = sblocks_for_recheck + - mirror_index; - struct scrub_recover *recover; - int page_index; - - for (page_index = 0; page_index < sblock->page_count; - page_index++) { - sblock->pagev[page_index]->sblock = NULL; - recover = sblock->pagev[page_index]->recover; - if (recover) { - scrub_put_recover(fs_info, recover); - sblock->pagev[page_index]->recover = - NULL; - } - scrub_page_put(sblock->pagev[page_index]); + for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) { + struct scrub_block *sblock = sblocks_for_recheck[mirror_index]; + struct scrub_recover *recover; + int sector_index; + + /* Not allocated, continue checking the next mirror */ + if (!sblock) + continue; + + for (sector_index = 0; sector_index < sblock->sector_count; + sector_index++) { + /* + * Here we just cleanup the recover, each sector will be + * properly cleaned up by later scrub_block_put() + */ + recover = sblock->sectors[sector_index]->recover; + if (recover) { + scrub_put_recover(fs_info, recover); + sblock->sectors[sector_index]->recover = NULL; } } - kfree(sblocks_for_recheck); + scrub_block_put(sblock); } ret = unlock_full_stripe(fs_info, logical, full_stripe_locked); @@ -1222,7 +1414,6 @@ static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc) static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, u64 *raid_map, - u64 mapped_length, int nstripes, int mirror, int *stripe_index, u64 *stripe_offset) @@ -1237,7 +1428,7 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, continue; if (logical >= raid_map[i] && - logical < raid_map[i] + mapped_length) + logical < raid_map[i] + BTRFS_STRIPE_LEN) break; } @@ -1251,32 +1442,26 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, } static int scrub_setup_recheck_block(struct scrub_block *original_sblock, - struct scrub_block *sblocks_for_recheck) + struct scrub_block *sblocks_for_recheck[]) { struct scrub_ctx *sctx = original_sblock->sctx; struct btrfs_fs_info *fs_info = sctx->fs_info; - u64 length = original_sblock->page_count * fs_info->sectorsize; - u64 logical = original_sblock->pagev[0]->logical; - u64 generation = original_sblock->pagev[0]->generation; - u64 flags = original_sblock->pagev[0]->flags; - u64 have_csum = original_sblock->pagev[0]->have_csum; + u64 logical = original_sblock->logical; + u64 length = original_sblock->sector_count << fs_info->sectorsize_bits; + u64 generation = original_sblock->sectors[0]->generation; + u64 flags = original_sblock->sectors[0]->flags; + u64 have_csum = original_sblock->sectors[0]->have_csum; struct scrub_recover *recover; struct btrfs_io_context *bioc; u64 sublen; u64 mapped_length; u64 stripe_offset; int stripe_index; - int page_index = 0; + int sector_index = 0; int mirror_index; int nmirrors; int ret; - /* - * note: the two members refs and outstanding_pages - * are not used (and not set) in the blocks that are used for - * the recheck procedure - */ - while (length > 0) { sublen = min_t(u64, length, fs_info->sectorsize); mapped_length = sublen; @@ -1306,70 +1491,63 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, recover->bioc = bioc; recover->map_length = mapped_length; - ASSERT(page_index < SCRUB_MAX_PAGES_PER_BLOCK); + ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK); nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS); for (mirror_index = 0; mirror_index < nmirrors; mirror_index++) { struct scrub_block *sblock; - struct scrub_page *spage; + struct scrub_sector *sector; - sblock = sblocks_for_recheck + mirror_index; + sblock = sblocks_for_recheck[mirror_index]; sblock->sctx = sctx; - spage = kzalloc(sizeof(*spage), GFP_NOFS); - if (!spage) { -leave_nomem: + sector = alloc_scrub_sector(sblock, logical, GFP_NOFS); + if (!sector) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); scrub_put_recover(fs_info, recover); return -ENOMEM; } - scrub_page_get(spage); - sblock->pagev[page_index] = spage; - spage->sblock = sblock; - spage->flags = flags; - spage->generation = generation; - spage->logical = logical; - spage->have_csum = have_csum; + sector->flags = flags; + sector->generation = generation; + sector->have_csum = have_csum; if (have_csum) - memcpy(spage->csum, - original_sblock->pagev[0]->csum, + memcpy(sector->csum, + original_sblock->sectors[0]->csum, sctx->fs_info->csum_size); scrub_stripe_index_and_offset(logical, bioc->map_type, bioc->raid_map, - mapped_length, bioc->num_stripes - bioc->num_tgtdevs, mirror_index, &stripe_index, &stripe_offset); - spage->physical = bioc->stripes[stripe_index].physical + - stripe_offset; - spage->dev = bioc->stripes[stripe_index].dev; - - BUG_ON(page_index >= original_sblock->page_count); - spage->physical_for_dev_replace = - original_sblock->pagev[page_index]-> - physical_for_dev_replace; - /* for missing devices, dev->bdev is NULL */ - spage->mirror_num = mirror_index + 1; - sblock->page_count++; - spage->page = alloc_page(GFP_NOFS); - if (!spage->page) - goto leave_nomem; + /* + * We're at the first sector, also populate @sblock + * physical and dev. + */ + if (sector_index == 0) { + sblock->physical = + bioc->stripes[stripe_index].physical + + stripe_offset; + sblock->dev = bioc->stripes[stripe_index].dev; + sblock->physical_for_dev_replace = + original_sblock->physical_for_dev_replace; + } + BUG_ON(sector_index >= original_sblock->sector_count); scrub_get_recover(recover); - spage->recover = recover; + sector->recover = recover; } scrub_put_recover(fs_info, recover); length -= sublen; logical += sublen; - page_index++; + sector_index++; } return 0; @@ -1382,22 +1560,15 @@ static void scrub_bio_wait_endio(struct bio *bio) static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, struct bio *bio, - struct scrub_page *spage) + struct scrub_sector *sector) { DECLARE_COMPLETION_ONSTACK(done); - int ret; - int mirror_num; - bio->bi_iter.bi_sector = spage->logical >> 9; + bio->bi_iter.bi_sector = (sector->offset + sector->sblock->logical) >> + SECTOR_SHIFT; bio->bi_private = &done; bio->bi_end_io = scrub_bio_wait_endio; - - mirror_num = spage->sblock->pagev[0]->mirror_num; - ret = raid56_parity_recover(bio, spage->recover->bioc, - spage->recover->map_length, - mirror_num, 0); - if (ret) - return ret; + raid56_parity_recover(bio, sector->recover->bioc, sector->sblock->mirror_num); wait_for_completion_io(&done); return blk_status_to_errno(bio->bi_status); @@ -1406,26 +1577,24 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info, struct scrub_block *sblock) { - struct scrub_page *first_page = sblock->pagev[0]; + struct scrub_sector *first_sector = sblock->sectors[0]; struct bio *bio; - int page_num; + int i; - /* All pages in sblock belong to the same stripe on the same device. */ - ASSERT(first_page->dev); - if (!first_page->dev->bdev) + /* All sectors in sblock belong to the same stripe on the same device. */ + ASSERT(sblock->dev); + if (!sblock->dev->bdev) goto out; - bio = btrfs_bio_alloc(BIO_MAX_VECS); - bio_set_dev(bio, first_page->dev->bdev); + bio = bio_alloc(sblock->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); - for (page_num = 0; page_num < sblock->page_count; page_num++) { - struct scrub_page *spage = sblock->pagev[page_num]; + for (i = 0; i < sblock->sector_count; i++) { + struct scrub_sector *sector = sblock->sectors[i]; - WARN_ON(!spage->page); - bio_add_page(bio, spage->page, PAGE_SIZE, 0); + bio_add_scrub_sector(bio, sector, fs_info->sectorsize); } - if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) { + if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) { bio_put(bio); goto out; } @@ -1436,65 +1605,63 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info, return; out: - for (page_num = 0; page_num < sblock->page_count; page_num++) - sblock->pagev[page_num]->io_error = 1; + for (i = 0; i < sblock->sector_count; i++) + sblock->sectors[i]->io_error = 1; sblock->no_io_error_seen = 0; } /* - * this function will check the on disk data for checksum errors, header - * errors and read I/O errors. If any I/O errors happen, the exact pages - * which are errored are marked as being bad. The goal is to enable scrub - * to take those pages that are not errored from all the mirrors so that - * the pages that are errored in the just handled mirror can be repaired. + * This function will check the on disk data for checksum errors, header errors + * and read I/O errors. If any I/O errors happen, the exact sectors which are + * errored are marked as being bad. The goal is to enable scrub to take those + * sectors that are not errored from all the mirrors so that the sectors that + * are errored in the just handled mirror can be repaired. */ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, struct scrub_block *sblock, int retry_failed_mirror) { - int page_num; + int i; sblock->no_io_error_seen = 1; /* short cut for raid56 */ - if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0])) + if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0])) return scrub_recheck_block_on_raid56(fs_info, sblock); - for (page_num = 0; page_num < sblock->page_count; page_num++) { - struct bio *bio; - struct scrub_page *spage = sblock->pagev[page_num]; + for (i = 0; i < sblock->sector_count; i++) { + struct scrub_sector *sector = sblock->sectors[i]; + struct bio bio; + struct bio_vec bvec; - if (spage->dev->bdev == NULL) { - spage->io_error = 1; + if (sblock->dev->bdev == NULL) { + sector->io_error = 1; sblock->no_io_error_seen = 0; continue; } - WARN_ON(!spage->page); - bio = btrfs_bio_alloc(1); - bio_set_dev(bio, spage->dev->bdev); - - bio_add_page(bio, spage->page, fs_info->sectorsize, 0); - bio->bi_iter.bi_sector = spage->physical >> 9; - bio->bi_opf = REQ_OP_READ; + bio_init(&bio, sblock->dev->bdev, &bvec, 1, REQ_OP_READ); + bio_add_scrub_sector(&bio, sector, fs_info->sectorsize); + bio.bi_iter.bi_sector = (sblock->physical + sector->offset) >> + SECTOR_SHIFT; - if (btrfsic_submit_bio_wait(bio)) { - spage->io_error = 1; + btrfsic_check_bio(&bio); + if (submit_bio_wait(&bio)) { + sector->io_error = 1; sblock->no_io_error_seen = 0; } - bio_put(bio); + bio_uninit(&bio); } if (sblock->no_io_error_seen) scrub_recheck_block_checksum(sblock); } -static inline int scrub_check_fsid(u8 fsid[], - struct scrub_page *spage) +static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector) { - struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices; + struct btrfs_fs_devices *fs_devices = sector->sblock->dev->fs_devices; int ret; ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE); @@ -1507,7 +1674,7 @@ static void scrub_recheck_block_checksum(struct scrub_block *sblock) sblock->checksum_error = 0; sblock->generation_error = 0; - if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA) + if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA) scrub_checksum_data(sblock); else scrub_checksum_tree_block(sblock); @@ -1516,15 +1683,14 @@ static void scrub_recheck_block_checksum(struct scrub_block *sblock) static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good) { - int page_num; + int i; int ret = 0; - for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { + for (i = 0; i < sblock_bad->sector_count; i++) { int ret_sub; - ret_sub = scrub_repair_page_from_good_copy(sblock_bad, - sblock_good, - page_num, 1); + ret_sub = scrub_repair_sector_from_good_copy(sblock_bad, + sblock_good, i, 1); if (ret_sub) ret = ret_sub; } @@ -1532,47 +1698,42 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, return ret; } -static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good, - int page_num, int force_write) +static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good, + int sector_num, int force_write) { - struct scrub_page *spage_bad = sblock_bad->pagev[page_num]; - struct scrub_page *spage_good = sblock_good->pagev[page_num]; + struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num]; + struct scrub_sector *sector_good = sblock_good->sectors[sector_num]; struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info; const u32 sectorsize = fs_info->sectorsize; - BUG_ON(spage_bad->page == NULL); - BUG_ON(spage_good->page == NULL); if (force_write || sblock_bad->header_error || - sblock_bad->checksum_error || spage_bad->io_error) { - struct bio *bio; + sblock_bad->checksum_error || sector_bad->io_error) { + struct bio bio; + struct bio_vec bvec; int ret; - if (!spage_bad->dev->bdev) { + if (!sblock_bad->dev->bdev) { btrfs_warn_rl(fs_info, "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected"); return -EIO; } - bio = btrfs_bio_alloc(1); - bio_set_dev(bio, spage_bad->dev->bdev); - bio->bi_iter.bi_sector = spage_bad->physical >> 9; - bio->bi_opf = REQ_OP_WRITE; + bio_init(&bio, sblock_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE); + bio.bi_iter.bi_sector = (sblock_bad->physical + + sector_bad->offset) >> SECTOR_SHIFT; + ret = bio_add_scrub_sector(&bio, sector_good, sectorsize); - ret = bio_add_page(bio, spage_good->page, sectorsize, 0); - if (ret != sectorsize) { - bio_put(bio); - return -EIO; - } + btrfsic_check_bio(&bio); + ret = submit_bio_wait(&bio); + bio_uninit(&bio); - if (btrfsic_submit_bio_wait(bio)) { - btrfs_dev_stat_inc_and_print(spage_bad->dev, + if (ret) { + btrfs_dev_stat_inc_and_print(sblock_bad->dev, BTRFS_DEV_STAT_WRITE_ERRS); atomic64_inc(&fs_info->dev_replace.num_write_errors); - bio_put(bio); return -EIO; } - bio_put(bio); } return 0; @@ -1581,7 +1742,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) { struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; - int page_num; + int i; /* * This block is used for the check of the parity on the source device, @@ -1590,25 +1751,24 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) if (sblock->sparity) return; - for (page_num = 0; page_num < sblock->page_count; page_num++) { + for (i = 0; i < sblock->sector_count; i++) { int ret; - ret = scrub_write_page_to_dev_replace(sblock, page_num); + ret = scrub_write_sector_to_dev_replace(sblock, i); if (ret) atomic64_inc(&fs_info->dev_replace.num_write_errors); } } -static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, - int page_num) +static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num) { - struct scrub_page *spage = sblock->pagev[page_num]; + const u32 sectorsize = sblock->sctx->fs_info->sectorsize; + struct scrub_sector *sector = sblock->sectors[sector_num]; - BUG_ON(spage->page == NULL); - if (spage->io_error) - clear_page(page_address(spage->page)); + if (sector->io_error) + memset(scrub_sector_get_kaddr(sector), 0, sectorsize); - return scrub_add_page_to_wr_bio(sblock->sctx, spage); + return scrub_add_sector_to_wr_bio(sblock->sctx, sector); } static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) @@ -1633,9 +1793,15 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) return ret; } -static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, - struct scrub_page *spage) +static void scrub_block_get(struct scrub_block *sblock) +{ + refcount_inc(&sblock->refs); +} + +static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx, + struct scrub_sector *sector) { + struct scrub_block *sblock = sector->sblock; struct scrub_bio *sbio; int ret; const u32 sectorsize = sctx->fs_info->sectorsize; @@ -1650,45 +1816,39 @@ again: return -ENOMEM; } sctx->wr_curr_bio->sctx = sctx; - sctx->wr_curr_bio->page_count = 0; + sctx->wr_curr_bio->sector_count = 0; } sbio = sctx->wr_curr_bio; - if (sbio->page_count == 0) { - struct bio *bio; - - ret = fill_writer_pointer_gap(sctx, - spage->physical_for_dev_replace); + if (sbio->sector_count == 0) { + ret = fill_writer_pointer_gap(sctx, sector->offset + + sblock->physical_for_dev_replace); if (ret) { mutex_unlock(&sctx->wr_lock); return ret; } - sbio->physical = spage->physical_for_dev_replace; - sbio->logical = spage->logical; + sbio->physical = sblock->physical_for_dev_replace + sector->offset; + sbio->logical = sblock->logical + sector->offset; sbio->dev = sctx->wr_tgtdev; - bio = sbio->bio; - if (!bio) { - bio = btrfs_bio_alloc(sctx->pages_per_bio); - sbio->bio = bio; + if (!sbio->bio) { + sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio, + REQ_OP_WRITE, GFP_NOFS); } - - bio->bi_private = sbio; - bio->bi_end_io = scrub_wr_bio_end_io; - bio_set_dev(bio, sbio->dev->bdev); - bio->bi_iter.bi_sector = sbio->physical >> 9; - bio->bi_opf = REQ_OP_WRITE; + sbio->bio->bi_private = sbio; + sbio->bio->bi_end_io = scrub_wr_bio_end_io; + sbio->bio->bi_iter.bi_sector = sbio->physical >> 9; sbio->status = 0; - } else if (sbio->physical + sbio->page_count * sectorsize != - spage->physical_for_dev_replace || - sbio->logical + sbio->page_count * sectorsize != - spage->logical) { + } else if (sbio->physical + sbio->sector_count * sectorsize != + sblock->physical_for_dev_replace + sector->offset || + sbio->logical + sbio->sector_count * sectorsize != + sblock->logical + sector->offset) { scrub_wr_submit(sctx); goto again; } - ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0); + ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize); if (ret != sectorsize) { - if (sbio->page_count < 1) { + if (sbio->sector_count < 1) { bio_put(sbio->bio); sbio->bio = NULL; mutex_unlock(&sctx->wr_lock); @@ -1698,10 +1858,17 @@ again: goto again; } - sbio->pagev[sbio->page_count] = spage; - scrub_page_get(spage); - sbio->page_count++; - if (sbio->page_count == sctx->pages_per_bio) + sbio->sectors[sbio->sector_count] = sector; + scrub_sector_get(sector); + /* + * Since ssector no longer holds a page, but uses sblock::pages, we + * have to ensure the sblock had not been freed before our write bio + * finished. + */ + scrub_block_get(sector->sblock); + + sbio->sector_count++; + if (sbio->sector_count == sctx->sectors_per_bio) scrub_wr_submit(sctx); mutex_unlock(&sctx->wr_lock); @@ -1717,16 +1884,16 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) sbio = sctx->wr_curr_bio; sctx->wr_curr_bio = NULL; - WARN_ON(!sbio->bio->bi_bdev); scrub_pending_bio_inc(sctx); /* process all writes in a single worker thread. Then the block layer * orders the requests before sending them to the driver which * doubled the write performance on spinning disks when measured * with Linux 3.5 */ - btrfsic_submit_bio(sbio->bio); + btrfsic_check_bio(sbio->bio); + submit_bio(sbio->bio); if (btrfs_is_zoned(sctx->fs_info)) - sctx->write_pointer = sbio->physical + sbio->page_count * + sctx->write_pointer = sbio->physical + sbio->sector_count * sctx->fs_info->sectorsize; } @@ -1738,31 +1905,37 @@ static void scrub_wr_bio_end_io(struct bio *bio) sbio->status = bio->bi_status; sbio->bio = bio; - btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL); - btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); + INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker); + queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); } -static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) +static void scrub_wr_bio_end_io_worker(struct work_struct *work) { struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); struct scrub_ctx *sctx = sbio->sctx; int i; - ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO); + ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO); if (sbio->status) { struct btrfs_dev_replace *dev_replace = &sbio->sctx->fs_info->dev_replace; - for (i = 0; i < sbio->page_count; i++) { - struct scrub_page *spage = sbio->pagev[i]; + for (i = 0; i < sbio->sector_count; i++) { + struct scrub_sector *sector = sbio->sectors[i]; - spage->io_error = 1; + sector->io_error = 1; atomic64_inc(&dev_replace->num_write_errors); } } - for (i = 0; i < sbio->page_count; i++) - scrub_page_put(sbio->pagev[i]); + /* + * In scrub_add_sector_to_wr_bio() we grab extra ref for sblock, now in + * endio we should put the sblock. + */ + for (i = 0; i < sbio->sector_count; i++) { + scrub_block_put(sbio->sectors[i]->sblock); + scrub_sector_put(sbio->sectors[i]); + } bio_put(sbio->bio); kfree(sbio); @@ -1786,15 +1959,15 @@ static int scrub_checksum(struct scrub_block *sblock) sblock->generation_error = 0; sblock->checksum_error = 0; - WARN_ON(sblock->page_count < 1); - flags = sblock->pagev[0]->flags; + WARN_ON(sblock->sector_count < 1); + flags = sblock->sectors[0]->flags; ret = 0; if (flags & BTRFS_EXTENT_FLAG_DATA) ret = scrub_checksum_data(sblock); else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) ret = scrub_checksum_tree_block(sblock); else if (flags & BTRFS_EXTENT_FLAG_SUPER) - (void)scrub_checksum_super(sblock); + ret = scrub_checksum_super(sblock); else WARN_ON(1); if (ret) @@ -1809,26 +1982,22 @@ static int scrub_checksum_data(struct scrub_block *sblock) struct btrfs_fs_info *fs_info = sctx->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 csum[BTRFS_CSUM_SIZE]; - struct scrub_page *spage; + struct scrub_sector *sector; char *kaddr; - BUG_ON(sblock->page_count < 1); - spage = sblock->pagev[0]; - if (!spage->have_csum) + BUG_ON(sblock->sector_count < 1); + sector = sblock->sectors[0]; + if (!sector->have_csum) return 0; - kaddr = page_address(spage->page); + kaddr = scrub_sector_get_kaddr(sector); shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); - /* - * In scrub_pages() and scrub_pages_for_parity() we ensure each spage - * only contains one sector of data. - */ crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); - if (memcmp(csum, spage->csum, fs_info->csum_size)) + if (memcmp(csum, sector->csum, fs_info->csum_size)) sblock->checksum_error = 1; return sblock->checksum_error; } @@ -1849,16 +2018,16 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) const u32 sectorsize = sctx->fs_info->sectorsize; const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits; int i; - struct scrub_page *spage; + struct scrub_sector *sector; char *kaddr; - BUG_ON(sblock->page_count < 1); + BUG_ON(sblock->sector_count < 1); - /* Each member in pagev is just one block, not a full page */ - ASSERT(sblock->page_count == num_sectors); + /* Each member in sectors is just one sector */ + ASSERT(sblock->sector_count == num_sectors); - spage = sblock->pagev[0]; - kaddr = page_address(spage->page); + sector = sblock->sectors[0]; + kaddr = scrub_sector_get_kaddr(sector); h = (struct btrfs_header *)kaddr; memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size); @@ -1867,15 +2036,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) * a) don't have an extent buffer and * b) the page is already kmapped */ - if (spage->logical != btrfs_stack_header_bytenr(h)) + if (sblock->logical != btrfs_stack_header_bytenr(h)) sblock->header_error = 1; - if (spage->generation != btrfs_stack_header_generation(h)) { + if (sector->generation != btrfs_stack_header_generation(h)) { sblock->header_error = 1; sblock->generation_error = 1; } - if (!scrub_check_fsid(h->fsid, spage)) + if (!scrub_check_fsid(h->fsid, sector)) sblock->header_error = 1; if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, @@ -1888,7 +2057,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) sectorsize - BTRFS_CSUM_SIZE); for (i = 1; i < num_sectors; i++) { - kaddr = page_address(sblock->pagev[i]->page); + kaddr = scrub_sector_get_kaddr(sblock->sectors[i]); crypto_shash_update(shash, kaddr, sectorsize); } @@ -1906,23 +2075,23 @@ static int scrub_checksum_super(struct scrub_block *sblock) struct btrfs_fs_info *fs_info = sctx->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 calculated_csum[BTRFS_CSUM_SIZE]; - struct scrub_page *spage; + struct scrub_sector *sector; char *kaddr; int fail_gen = 0; int fail_cor = 0; - BUG_ON(sblock->page_count < 1); - spage = sblock->pagev[0]; - kaddr = page_address(spage->page); + BUG_ON(sblock->sector_count < 1); + sector = sblock->sectors[0]; + kaddr = scrub_sector_get_kaddr(sector); s = (struct btrfs_super_block *)kaddr; - if (spage->logical != btrfs_super_bytenr(s)) + if (sblock->logical != btrfs_super_bytenr(s)) ++fail_cor; - if (spage->generation != btrfs_super_generation(s)) + if (sector->generation != btrfs_super_generation(s)) ++fail_gen; - if (!scrub_check_fsid(s->fsid, spage)) + if (!scrub_check_fsid(s->fsid, sector)) ++fail_cor; shash->tfm = fs_info->csum_shash; @@ -1933,31 +2102,9 @@ static int scrub_checksum_super(struct scrub_block *sblock) if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size)) ++fail_cor; - if (fail_cor + fail_gen) { - /* - * if we find an error in a super block, we just report it. - * They will get written with the next transaction commit - * anyway - */ - spin_lock(&sctx->stat_lock); - ++sctx->stat.super_errors; - spin_unlock(&sctx->stat_lock); - if (fail_cor) - btrfs_dev_stat_inc_and_print(spage->dev, - BTRFS_DEV_STAT_CORRUPTION_ERRS); - else - btrfs_dev_stat_inc_and_print(spage->dev, - BTRFS_DEV_STAT_GENERATION_ERRS); - } - return fail_cor + fail_gen; } -static void scrub_block_get(struct scrub_block *sblock) -{ - refcount_inc(&sblock->refs); -} - static void scrub_block_put(struct scrub_block *sblock) { if (refcount_dec_and_test(&sblock->refs)) { @@ -1966,24 +2113,27 @@ static void scrub_block_put(struct scrub_block *sblock) if (sblock->sparity) scrub_parity_put(sblock->sparity); - for (i = 0; i < sblock->page_count; i++) - scrub_page_put(sblock->pagev[i]); + for (i = 0; i < sblock->sector_count; i++) + scrub_sector_put(sblock->sectors[i]); + for (i = 0; i < DIV_ROUND_UP(sblock->len, PAGE_SIZE); i++) { + if (sblock->pages[i]) { + detach_scrub_page_private(sblock->pages[i]); + __free_page(sblock->pages[i]); + } + } kfree(sblock); } } -static void scrub_page_get(struct scrub_page *spage) +static void scrub_sector_get(struct scrub_sector *sector) { - atomic_inc(&spage->refs); + atomic_inc(§or->refs); } -static void scrub_page_put(struct scrub_page *spage) +static void scrub_sector_put(struct scrub_sector *sector) { - if (atomic_dec_and_test(&spage->refs)) { - if (spage->page) - __free_page(spage->page); - kfree(spage); - } + if (atomic_dec_and_test(§or->refs)) + kfree(sector); } /* @@ -2057,13 +2207,14 @@ static void scrub_submit(struct scrub_ctx *sctx) sbio = sctx->bios[sctx->curr]; sctx->curr = -1; scrub_pending_bio_inc(sctx); - btrfsic_submit_bio(sbio->bio); + btrfsic_check_bio(sbio->bio); + submit_bio(sbio->bio); } -static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, - struct scrub_page *spage) +static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx, + struct scrub_sector *sector) { - struct scrub_block *sblock = spage->sblock; + struct scrub_block *sblock = sector->sblock; struct scrub_bio *sbio; const u32 sectorsize = sctx->fs_info->sectorsize; int ret; @@ -2078,7 +2229,7 @@ again: if (sctx->curr != -1) { sctx->first_free = sctx->bios[sctx->curr]->next_free; sctx->bios[sctx->curr]->next_free = -1; - sctx->bios[sctx->curr]->page_count = 0; + sctx->bios[sctx->curr]->sector_count = 0; spin_unlock(&sctx->list_lock); } else { spin_unlock(&sctx->list_lock); @@ -2086,37 +2237,31 @@ again: } } sbio = sctx->bios[sctx->curr]; - if (sbio->page_count == 0) { - struct bio *bio; - - sbio->physical = spage->physical; - sbio->logical = spage->logical; - sbio->dev = spage->dev; - bio = sbio->bio; - if (!bio) { - bio = btrfs_bio_alloc(sctx->pages_per_bio); - sbio->bio = bio; + if (sbio->sector_count == 0) { + sbio->physical = sblock->physical + sector->offset; + sbio->logical = sblock->logical + sector->offset; + sbio->dev = sblock->dev; + if (!sbio->bio) { + sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio, + REQ_OP_READ, GFP_NOFS); } - - bio->bi_private = sbio; - bio->bi_end_io = scrub_bio_end_io; - bio_set_dev(bio, sbio->dev->bdev); - bio->bi_iter.bi_sector = sbio->physical >> 9; - bio->bi_opf = REQ_OP_READ; + sbio->bio->bi_private = sbio; + sbio->bio->bi_end_io = scrub_bio_end_io; + sbio->bio->bi_iter.bi_sector = sbio->physical >> 9; sbio->status = 0; - } else if (sbio->physical + sbio->page_count * sectorsize != - spage->physical || - sbio->logical + sbio->page_count * sectorsize != - spage->logical || - sbio->dev != spage->dev) { + } else if (sbio->physical + sbio->sector_count * sectorsize != + sblock->physical + sector->offset || + sbio->logical + sbio->sector_count * sectorsize != + sblock->logical + sector->offset || + sbio->dev != sblock->dev) { scrub_submit(sctx); goto again; } - sbio->pagev[sbio->page_count] = spage; - ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0); + sbio->sectors[sbio->sector_count] = sector; + ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize); if (ret != sectorsize) { - if (sbio->page_count < 1) { + if (sbio->sector_count < 1) { bio_put(sbio->bio); sbio->bio = NULL; return -EIO; @@ -2126,9 +2271,9 @@ again: } scrub_block_get(sblock); /* one for the page added to the bio */ - atomic_inc(&sblock->outstanding_pages); - sbio->page_count++; - if (sbio->page_count == sctx->pages_per_bio) + atomic_inc(&sblock->outstanding_sectors); + sbio->sector_count++; + if (sbio->sector_count == sctx->sectors_per_bio) scrub_submit(sctx); return 0; @@ -2139,15 +2284,16 @@ static void scrub_missing_raid56_end_io(struct bio *bio) struct scrub_block *sblock = bio->bi_private; struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; + btrfs_bio_counter_dec(fs_info); if (bio->bi_status) sblock->no_io_error_seen = 0; bio_put(bio); - btrfs_queue_work(fs_info->scrub_workers, &sblock->work); + queue_work(fs_info->scrub_workers, &sblock->work); } -static void scrub_missing_raid56_worker(struct btrfs_work *work) +static void scrub_missing_raid56_worker(struct work_struct *work) { struct scrub_block *sblock = container_of(work, struct scrub_block, work); struct scrub_ctx *sctx = sblock->sctx; @@ -2155,8 +2301,8 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work) u64 logical; struct btrfs_device *dev; - logical = sblock->pagev[0]->logical; - dev = sblock->pagev[0]->dev; + logical = sblock->logical; + dev = sblock->dev; if (sblock->no_io_error_seen) scrub_recheck_block_checksum(sblock); @@ -2193,8 +2339,8 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) { struct scrub_ctx *sctx = sblock->sctx; struct btrfs_fs_info *fs_info = sctx->fs_info; - u64 length = sblock->page_count * PAGE_SIZE; - u64 logical = sblock->pagev[0]->logical; + u64 length = sblock->sector_count << fs_info->sectorsize_bits; + u64 logical = sblock->logical; struct btrfs_io_context *bioc = NULL; struct bio *bio; struct btrfs_raid_bio *rbio; @@ -2213,30 +2359,33 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) * We shouldn't be scrubbing a missing device. Even for dev * replace, we should only get here for RAID 5/6. We either * managed to mount something with no mirrors remaining or - * there's a bug in scrub_remap_extent()/btrfs_map_block(). + * there's a bug in scrub_find_good_copy()/btrfs_map_block(). */ goto bioc_out; } - bio = btrfs_bio_alloc(BIO_MAX_VECS); + bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); bio->bi_iter.bi_sector = logical >> 9; bio->bi_private = sblock; bio->bi_end_io = scrub_missing_raid56_end_io; - rbio = raid56_alloc_missing_rbio(bio, bioc, length); + rbio = raid56_alloc_missing_rbio(bio, bioc); if (!rbio) goto rbio_out; - for (i = 0; i < sblock->page_count; i++) { - struct scrub_page *spage = sblock->pagev[i]; + for (i = 0; i < sblock->sector_count; i++) { + struct scrub_sector *sector = sblock->sectors[i]; - raid56_add_scrub_pages(rbio, spage->page, spage->logical); + raid56_add_scrub_pages(rbio, scrub_sector_get_page(sector), + scrub_sector_get_page_offset(sector), + sector->offset + sector->sblock->logical); } - btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL); + INIT_WORK(&sblock->work, scrub_missing_raid56_worker); scrub_block_get(sblock); scrub_pending_bio_inc(sctx); raid56_submit_missing_rbio(rbio); + btrfs_put_bioc(bioc); return; rbio_out: @@ -2249,7 +2398,7 @@ bioc_out: spin_unlock(&sctx->stat_lock); } -static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, +static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num, u8 *csum, u64 physical_for_dev_replace) @@ -2258,7 +2407,8 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, const u32 sectorsize = sctx->fs_info->sectorsize; int index; - sblock = kzalloc(sizeof(*sblock), GFP_KERNEL); + sblock = alloc_scrub_block(sctx, dev, logical, physical, + physical_for_dev_replace, mirror_num); if (!sblock) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2266,14 +2416,8 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, return -ENOMEM; } - /* one ref inside this function, plus one for each page added to - * a bio later on */ - refcount_set(&sblock->refs, 1); - sblock->sctx = sctx; - sblock->no_io_error_seen = 1; - for (index = 0; len > 0; index++) { - struct scrub_page *spage; + struct scrub_sector *sector; /* * Here we will allocate one page for one sector to scrub. * This is fine if PAGE_SIZE == sectorsize, but will cost @@ -2281,43 +2425,29 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, */ u32 l = min(sectorsize, len); - spage = kzalloc(sizeof(*spage), GFP_KERNEL); - if (!spage) { -leave_nomem: + sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL); + if (!sector) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); scrub_block_put(sblock); return -ENOMEM; } - ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK); - scrub_page_get(spage); - sblock->pagev[index] = spage; - spage->sblock = sblock; - spage->dev = dev; - spage->flags = flags; - spage->generation = gen; - spage->logical = logical; - spage->physical = physical; - spage->physical_for_dev_replace = physical_for_dev_replace; - spage->mirror_num = mirror_num; + sector->flags = flags; + sector->generation = gen; if (csum) { - spage->have_csum = 1; - memcpy(spage->csum, csum, sctx->fs_info->csum_size); + sector->have_csum = 1; + memcpy(sector->csum, csum, sctx->fs_info->csum_size); } else { - spage->have_csum = 0; + sector->have_csum = 0; } - sblock->page_count++; - spage->page = alloc_page(GFP_KERNEL); - if (!spage->page) - goto leave_nomem; len -= l; logical += l; physical += l; physical_for_dev_replace += l; } - WARN_ON(sblock->page_count == 0); + WARN_ON(sblock->sector_count == 0); if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) { /* * This case should only be hit for RAID 5/6 device replace. See @@ -2325,11 +2455,11 @@ leave_nomem: */ scrub_missing_raid56_pages(sblock); } else { - for (index = 0; index < sblock->page_count; index++) { - struct scrub_page *spage = sblock->pagev[index]; + for (index = 0; index < sblock->sector_count; index++) { + struct scrub_sector *sector = sblock->sectors[index]; int ret; - ret = scrub_add_page_to_rd_bio(sctx, spage); + ret = scrub_add_sector_to_rd_bio(sctx, sector); if (ret) { scrub_block_put(sblock); return ret; @@ -2353,31 +2483,31 @@ static void scrub_bio_end_io(struct bio *bio) sbio->status = bio->bi_status; sbio->bio = bio; - btrfs_queue_work(fs_info->scrub_workers, &sbio->work); + queue_work(fs_info->scrub_workers, &sbio->work); } -static void scrub_bio_end_io_worker(struct btrfs_work *work) +static void scrub_bio_end_io_worker(struct work_struct *work) { struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); struct scrub_ctx *sctx = sbio->sctx; int i; - ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO); + ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO); if (sbio->status) { - for (i = 0; i < sbio->page_count; i++) { - struct scrub_page *spage = sbio->pagev[i]; + for (i = 0; i < sbio->sector_count; i++) { + struct scrub_sector *sector = sbio->sectors[i]; - spage->io_error = 1; - spage->sblock->no_io_error_seen = 0; + sector->io_error = 1; + sector->sblock->no_io_error_seen = 0; } } - /* now complete the scrub_block items that have all pages completed */ - for (i = 0; i < sbio->page_count; i++) { - struct scrub_page *spage = sbio->pagev[i]; - struct scrub_block *sblock = spage->sblock; + /* Now complete the scrub_block items that have all pages completed */ + for (i = 0; i < sbio->sector_count; i++) { + struct scrub_sector *sector = sbio->sectors[i]; + struct scrub_block *sblock = sector->sblock; - if (atomic_dec_and_test(&sblock->outstanding_pages)) + if (atomic_dec_and_test(&sblock->outstanding_sectors)) scrub_block_complete(sblock); scrub_block_put(sblock); } @@ -2428,13 +2558,13 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity, u64 start, u32 len) { - __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len); + __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len); } static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity, u64 start, u32 len) { - __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len); + __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len); } static void scrub_block_complete(struct scrub_block *sblock) @@ -2456,8 +2586,9 @@ static void scrub_block_complete(struct scrub_block *sblock) } if (sblock->sparity && corrupted && !sblock->data_corrected) { - u64 start = sblock->pagev[0]->logical; - u64 end = sblock->pagev[sblock->page_count - 1]->logical + + u64 start = sblock->logical; + u64 end = sblock->logical + + sblock->sectors[sblock->sector_count - 1]->offset + sblock->sctx->fs_info->sectorsize; ASSERT(end - start <= U32_MAX); @@ -2532,8 +2663,11 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, u64 logical, u32 len, u64 physical, struct btrfs_device *dev, u64 flags, - u64 gen, int mirror_num, u64 physical_for_dev_replace) + u64 gen, int mirror_num) { + struct btrfs_device *src_dev = dev; + u64 src_physical = physical; + int src_mirror = mirror_num; int ret; u8 csum[BTRFS_CSUM_SIZE]; u32 blocksize; @@ -2561,6 +2695,18 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, WARN_ON(1); } + /* + * For dev-replace case, we can have @dev being a missing device. + * Regular scrub will avoid its execution on missing device at all, + * as that would trigger tons of read error. + * + * Reading from missing device will cause read error counts to + * increase unnecessarily. + * So here we change the read source to a good mirror. + */ + if (sctx->is_dev_replace && !dev->bdev) + scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical, + &src_dev, &src_mirror); while (len) { u32 l = min(len, blocksize); int have_csum = 0; @@ -2571,20 +2717,20 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, if (have_csum == 0) ++sctx->stat.no_csum; } - ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen, - mirror_num, have_csum ? csum : NULL, - physical_for_dev_replace); + ret = scrub_sectors(sctx, logical, l, src_physical, src_dev, + flags, gen, src_mirror, + have_csum ? csum : NULL, physical); if (ret) return ret; len -= l; logical += l; physical += l; - physical_for_dev_replace += l; + src_physical += l; } return 0; } -static int scrub_pages_for_parity(struct scrub_parity *sparity, +static int scrub_sectors_for_parity(struct scrub_parity *sparity, u64 logical, u32 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num, u8 *csum) @@ -2596,7 +2742,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity, ASSERT(IS_ALIGNED(len, sectorsize)); - sblock = kzalloc(sizeof(*sblock), GFP_KERNEL); + sblock = alloc_scrub_block(sctx, dev, logical, physical, physical, mirror_num); if (!sblock) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2604,51 +2750,32 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity, return -ENOMEM; } - /* one ref inside this function, plus one for each page added to - * a bio later on */ - refcount_set(&sblock->refs, 1); - sblock->sctx = sctx; - sblock->no_io_error_seen = 1; sblock->sparity = sparity; scrub_parity_get(sparity); for (index = 0; len > 0; index++) { - struct scrub_page *spage; + struct scrub_sector *sector; - spage = kzalloc(sizeof(*spage), GFP_KERNEL); - if (!spage) { -leave_nomem: + sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL); + if (!sector) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); scrub_block_put(sblock); return -ENOMEM; } - ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK); - /* For scrub block */ - scrub_page_get(spage); - sblock->pagev[index] = spage; + sblock->sectors[index] = sector; /* For scrub parity */ - scrub_page_get(spage); - list_add_tail(&spage->list, &sparity->spages); - spage->sblock = sblock; - spage->dev = dev; - spage->flags = flags; - spage->generation = gen; - spage->logical = logical; - spage->physical = physical; - spage->mirror_num = mirror_num; + scrub_sector_get(sector); + list_add_tail(§or->list, &sparity->sectors_list); + sector->flags = flags; + sector->generation = gen; if (csum) { - spage->have_csum = 1; - memcpy(spage->csum, csum, sctx->fs_info->csum_size); + sector->have_csum = 1; + memcpy(sector->csum, csum, sctx->fs_info->csum_size); } else { - spage->have_csum = 0; + sector->have_csum = 0; } - sblock->page_count++; - spage->page = alloc_page(GFP_KERNEL); - if (!spage->page) - goto leave_nomem; - /* Iterate over the stripe range in sectorsize steps */ len -= sectorsize; @@ -2656,19 +2783,19 @@ leave_nomem: physical += sectorsize; } - WARN_ON(sblock->page_count == 0); - for (index = 0; index < sblock->page_count; index++) { - struct scrub_page *spage = sblock->pagev[index]; + WARN_ON(sblock->sector_count == 0); + for (index = 0; index < sblock->sector_count; index++) { + struct scrub_sector *sector = sblock->sectors[index]; int ret; - ret = scrub_add_page_to_rd_bio(sctx, spage); + ret = scrub_add_sector_to_rd_bio(sctx, sector); if (ret) { scrub_block_put(sblock); return ret; } } - /* last one frees, either here or in bio completion for last page */ + /* Last one frees, either here or in bio completion for last sector */ scrub_block_put(sblock); return 0; } @@ -2707,7 +2834,7 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity, if (have_csum == 0) goto skip; } - ret = scrub_pages_for_parity(sparity, logical, l, physical, dev, + ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev, flags, gen, mirror_num, have_csum ? csum : NULL); if (ret) @@ -2767,10 +2894,10 @@ static int get_raid56_logic_offset(u64 physical, int num, static void scrub_free_parity(struct scrub_parity *sparity) { struct scrub_ctx *sctx = sparity->sctx; - struct scrub_page *curr, *next; + struct scrub_sector *curr, *next; int nbits; - nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors); + nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors); if (nbits) { spin_lock(&sctx->stat_lock); sctx->stat.read_errors += nbits; @@ -2778,38 +2905,38 @@ static void scrub_free_parity(struct scrub_parity *sparity) spin_unlock(&sctx->stat_lock); } - list_for_each_entry_safe(curr, next, &sparity->spages, list) { + list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) { list_del_init(&curr->list); - scrub_page_put(curr); + scrub_sector_put(curr); } kfree(sparity); } -static void scrub_parity_bio_endio_worker(struct btrfs_work *work) +static void scrub_parity_bio_endio_worker(struct work_struct *work) { struct scrub_parity *sparity = container_of(work, struct scrub_parity, work); struct scrub_ctx *sctx = sparity->sctx; + btrfs_bio_counter_dec(sctx->fs_info); scrub_free_parity(sparity); scrub_pending_bio_dec(sctx); } static void scrub_parity_bio_endio(struct bio *bio) { - struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; + struct scrub_parity *sparity = bio->bi_private; struct btrfs_fs_info *fs_info = sparity->sctx->fs_info; if (bio->bi_status) - bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, - sparity->nsectors); + bitmap_or(&sparity->ebitmap, &sparity->ebitmap, + &sparity->dbitmap, sparity->nsectors); bio_put(bio); - btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL, - NULL); - btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work); + INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker); + queue_work(fs_info->scrub_parity_workers, &sparity->work); } static void scrub_parity_check_and_repair(struct scrub_parity *sparity) @@ -2822,8 +2949,8 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) u64 length; int ret; - if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap, - sparity->nsectors)) + if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap, + &sparity->ebitmap, sparity->nsectors)) goto out; length = sparity->logic_end - sparity->logic_start; @@ -2834,15 +2961,16 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) if (ret || !bioc || !bioc->raid_map) goto bioc_out; - bio = btrfs_bio_alloc(BIO_MAX_VECS); + bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); bio->bi_iter.bi_sector = sparity->logic_start >> 9; bio->bi_private = sparity; bio->bi_end_io = scrub_parity_bio_endio; - rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, length, + rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, sparity->scrub_dev, - sparity->dbitmap, + &sparity->dbitmap, sparity->nsectors); + btrfs_put_bioc(bioc); if (!rbio) goto rbio_out; @@ -2854,8 +2982,7 @@ rbio_out: bio_put(bio); bioc_out: btrfs_bio_counter_dec(fs_info); - btrfs_put_bioc(bioc); - bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, + bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap, sparity->nsectors); spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2864,11 +2991,6 @@ out: scrub_free_parity(sparity); } -static inline int scrub_calc_parity_bitmap_len(int nsectors) -{ - return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long); -} - static void scrub_parity_get(struct scrub_parity *sparity) { refcount_inc(&sparity->refs); @@ -2882,6 +3004,251 @@ static void scrub_parity_put(struct scrub_parity *sparity) scrub_parity_check_and_repair(sparity); } +/* + * Return 0 if the extent item range covers any byte of the range. + * Return <0 if the extent item is before @search_start. + * Return >0 if the extent item is after @start_start + @search_len. + */ +static int compare_extent_item_range(struct btrfs_path *path, + u64 search_start, u64 search_len) +{ + struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info; + u64 len; + struct btrfs_key key; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY); + if (key.type == BTRFS_METADATA_ITEM_KEY) + len = fs_info->nodesize; + else + len = key.offset; + + if (key.objectid + len <= search_start) + return -1; + if (key.objectid >= search_start + search_len) + return 1; + return 0; +} + +/* + * Locate one extent item which covers any byte in range + * [@search_start, @search_start + @search_length) + * + * If the path is not initialized, we will initialize the search by doing + * a btrfs_search_slot(). + * If the path is already initialized, we will use the path as the initial + * slot, to avoid duplicated btrfs_search_slot() calls. + * + * NOTE: If an extent item starts before @search_start, we will still + * return the extent item. This is for data extent crossing stripe boundary. + * + * Return 0 if we found such extent item, and @path will point to the extent item. + * Return >0 if no such extent item can be found, and @path will be released. + * Return <0 if hit fatal error, and @path will be released. + */ +static int find_first_extent_item(struct btrfs_root *extent_root, + struct btrfs_path *path, + u64 search_start, u64 search_len) +{ + struct btrfs_fs_info *fs_info = extent_root->fs_info; + struct btrfs_key key; + int ret; + + /* Continue using the existing path */ + if (path->nodes[0]) + goto search_forward; + + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; + key.objectid = search_start; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + return ret; + + ASSERT(ret > 0); + /* + * Here we intentionally pass 0 as @min_objectid, as there could be + * an extent item starting before @search_start. + */ + ret = btrfs_previous_extent_item(extent_root, path, 0); + if (ret < 0) + return ret; + /* + * No matter whether we have found an extent item, the next loop will + * properly do every check on the key. + */ +search_forward: + while (true) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid >= search_start + search_len) + break; + if (key.type != BTRFS_METADATA_ITEM_KEY && + key.type != BTRFS_EXTENT_ITEM_KEY) + goto next; + + ret = compare_extent_item_range(path, search_start, search_len); + if (ret == 0) + return ret; + if (ret > 0) + break; +next: + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(extent_root, path); + if (ret) { + /* Either no more item or fatal error */ + btrfs_release_path(path); + return ret; + } + } + } + btrfs_release_path(path); + return 1; +} + +static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret, + u64 *size_ret, u64 *flags_ret, u64 *generation_ret) +{ + struct btrfs_key key; + struct btrfs_extent_item *ei; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + ASSERT(key.type == BTRFS_METADATA_ITEM_KEY || + key.type == BTRFS_EXTENT_ITEM_KEY); + *extent_start_ret = key.objectid; + if (key.type == BTRFS_METADATA_ITEM_KEY) + *size_ret = path->nodes[0]->fs_info->nodesize; + else + *size_ret = key.offset; + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); + *flags_ret = btrfs_extent_flags(path->nodes[0], ei); + *generation_ret = btrfs_extent_generation(path->nodes[0], ei); +} + +static bool does_range_cross_boundary(u64 extent_start, u64 extent_len, + u64 boundary_start, u64 boudary_len) +{ + return (extent_start < boundary_start && + extent_start + extent_len > boundary_start) || + (extent_start < boundary_start + boudary_len && + extent_start + extent_len > boundary_start + boudary_len); +} + +static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx, + struct scrub_parity *sparity, + struct map_lookup *map, + struct btrfs_device *sdev, + struct btrfs_path *path, + u64 logical) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical); + struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical); + u64 cur_logical = logical; + int ret; + + ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK); + + /* Path must not be populated */ + ASSERT(!path->nodes[0]); + + while (cur_logical < logical + map->stripe_len) { + struct btrfs_io_context *bioc = NULL; + struct btrfs_device *extent_dev; + u64 extent_start; + u64 extent_size; + u64 mapped_length; + u64 extent_flags; + u64 extent_gen; + u64 extent_physical; + u64 extent_mirror_num; + + ret = find_first_extent_item(extent_root, path, cur_logical, + logical + map->stripe_len - cur_logical); + /* No more extent item in this data stripe */ + if (ret > 0) { + ret = 0; + break; + } + if (ret < 0) + break; + get_extent_info(path, &extent_start, &extent_size, &extent_flags, + &extent_gen); + + /* Metadata should not cross stripe boundaries */ + if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && + does_range_cross_boundary(extent_start, extent_size, + logical, map->stripe_len)) { + btrfs_err(fs_info, + "scrub: tree block %llu spanning stripes, ignored. logical=%llu", + extent_start, logical); + spin_lock(&sctx->stat_lock); + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + cur_logical += extent_size; + continue; + } + + /* Skip hole range which doesn't have any extent */ + cur_logical = max(extent_start, cur_logical); + + /* Truncate the range inside this data stripe */ + extent_size = min(extent_start + extent_size, + logical + map->stripe_len) - cur_logical; + extent_start = cur_logical; + ASSERT(extent_size <= U32_MAX); + + scrub_parity_mark_sectors_data(sparity, extent_start, extent_size); + + mapped_length = extent_size; + ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start, + &mapped_length, &bioc, 0); + if (!ret && (!bioc || mapped_length < extent_size)) + ret = -EIO; + if (ret) { + btrfs_put_bioc(bioc); + scrub_parity_mark_sectors_error(sparity, extent_start, + extent_size); + break; + } + extent_physical = bioc->stripes[0].physical; + extent_mirror_num = bioc->mirror_num; + extent_dev = bioc->stripes[0].dev; + btrfs_put_bioc(bioc); + + ret = btrfs_lookup_csums_range(csum_root, extent_start, + extent_start + extent_size - 1, + &sctx->csum_list, 1, false); + if (ret) { + scrub_parity_mark_sectors_error(sparity, extent_start, + extent_size); + break; + } + + ret = scrub_extent_for_parity(sparity, extent_start, + extent_size, extent_physical, + extent_dev, extent_flags, + extent_gen, extent_mirror_num); + scrub_free_csums(sctx); + + if (ret) { + scrub_parity_mark_sectors_error(sparity, extent_start, + extent_size); + break; + } + + cond_resched(); + cur_logical += extent_size; + } + btrfs_release_path(path); + return ret; +} + static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *sdev, @@ -2889,28 +3256,11 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, u64 logic_end) { struct btrfs_fs_info *fs_info = sctx->fs_info; - struct btrfs_root *root = btrfs_extent_root(fs_info, logic_start); - struct btrfs_root *csum_root; - struct btrfs_extent_item *extent; - struct btrfs_io_context *bioc = NULL; struct btrfs_path *path; - u64 flags; + u64 cur_logical; int ret; - int slot; - struct extent_buffer *l; - struct btrfs_key key; - u64 generation; - u64 extent_logical; - u64 extent_physical; - /* Check the comment in scrub_stripe() for why u32 is enough here */ - u32 extent_len; - u64 mapped_length; - struct btrfs_device *extent_dev; struct scrub_parity *sparity; int nsectors; - int bitmap_len; - int extent_mirror_num; - int stop_loop = 0; path = btrfs_alloc_path(); if (!path) { @@ -2924,9 +3274,8 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, ASSERT(map->stripe_len <= U32_MAX); nsectors = map->stripe_len >> fs_info->sectorsize_bits; - bitmap_len = scrub_calc_parity_bitmap_len(nsectors); - sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len, - GFP_NOFS); + ASSERT(nsectors <= BITS_PER_LONG); + sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS); if (!sparity) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2943,178 +3292,17 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, sparity->logic_start = logic_start; sparity->logic_end = logic_end; refcount_set(&sparity->refs, 1); - INIT_LIST_HEAD(&sparity->spages); - sparity->dbitmap = sparity->bitmap; - sparity->ebitmap = (void *)sparity->bitmap + bitmap_len; + INIT_LIST_HEAD(&sparity->sectors_list); ret = 0; - while (logic_start < logic_end) { - if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) - key.type = BTRFS_METADATA_ITEM_KEY; - else - key.type = BTRFS_EXTENT_ITEM_KEY; - key.objectid = logic_start; - key.offset = (u64)-1; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + for (cur_logical = logic_start; cur_logical < logic_end; + cur_logical += map->stripe_len) { + ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map, + sdev, path, cur_logical); if (ret < 0) - goto out; - - if (ret > 0) { - ret = btrfs_previous_extent_item(root, path, 0); - if (ret < 0) - goto out; - if (ret > 0) { - btrfs_release_path(path); - ret = btrfs_search_slot(NULL, root, &key, - path, 0, 0); - if (ret < 0) - goto out; - } - } - - stop_loop = 0; - while (1) { - u64 bytes; - - l = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(l)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto out; - - stop_loop = 1; - break; - } - btrfs_item_key_to_cpu(l, &key, slot); - - if (key.type != BTRFS_EXTENT_ITEM_KEY && - key.type != BTRFS_METADATA_ITEM_KEY) - goto next; - - if (key.type == BTRFS_METADATA_ITEM_KEY) - bytes = fs_info->nodesize; - else - bytes = key.offset; - - if (key.objectid + bytes <= logic_start) - goto next; - - if (key.objectid >= logic_end) { - stop_loop = 1; - break; - } - - while (key.objectid >= logic_start + map->stripe_len) - logic_start += map->stripe_len; - - extent = btrfs_item_ptr(l, slot, - struct btrfs_extent_item); - flags = btrfs_extent_flags(l, extent); - generation = btrfs_extent_generation(l, extent); - - if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && - (key.objectid < logic_start || - key.objectid + bytes > - logic_start + map->stripe_len)) { - btrfs_err(fs_info, - "scrub: tree block %llu spanning stripes, ignored. logical=%llu", - key.objectid, logic_start); - spin_lock(&sctx->stat_lock); - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - goto next; - } -again: - extent_logical = key.objectid; - ASSERT(bytes <= U32_MAX); - extent_len = bytes; - - if (extent_logical < logic_start) { - extent_len -= logic_start - extent_logical; - extent_logical = logic_start; - } - - if (extent_logical + extent_len > - logic_start + map->stripe_len) - extent_len = logic_start + map->stripe_len - - extent_logical; - - scrub_parity_mark_sectors_data(sparity, extent_logical, - extent_len); - - mapped_length = extent_len; - bioc = NULL; - ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, - extent_logical, &mapped_length, &bioc, - 0); - if (!ret) { - if (!bioc || mapped_length < extent_len) - ret = -EIO; - } - if (ret) { - btrfs_put_bioc(bioc); - goto out; - } - extent_physical = bioc->stripes[0].physical; - extent_mirror_num = bioc->mirror_num; - extent_dev = bioc->stripes[0].dev; - btrfs_put_bioc(bioc); - - csum_root = btrfs_csum_root(fs_info, extent_logical); - ret = btrfs_lookup_csums_range(csum_root, - extent_logical, - extent_logical + extent_len - 1, - &sctx->csum_list, 1); - if (ret) - goto out; - - ret = scrub_extent_for_parity(sparity, extent_logical, - extent_len, - extent_physical, - extent_dev, flags, - generation, - extent_mirror_num); - - scrub_free_csums(sctx); - - if (ret) - goto out; - - if (extent_logical + extent_len < - key.objectid + bytes) { - logic_start += map->stripe_len; - - if (logic_start >= logic_end) { - stop_loop = 1; - break; - } - - if (logic_start < key.objectid + bytes) { - cond_resched(); - goto again; - } - } -next: - path->slots[0]++; - } - - btrfs_release_path(path); - - if (stop_loop) break; - - logic_start += map->stripe_len; - } -out: - if (ret < 0) { - ASSERT(logic_end - logic_start <= U32_MAX); - scrub_parity_mark_sectors_error(sparity, logic_start, - logic_end - logic_start); } + scrub_parity_put(sparity); scrub_submit(sctx); mutex_lock(&sctx->wr_lock); @@ -3165,69 +3353,234 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, return ret; } +/* + * Scrub one range which can only has simple mirror based profile. + * (Including all range in SINGLE/DUP/RAID1/RAID1C*, and each stripe in + * RAID0/RAID10). + * + * Since we may need to handle a subset of block group, we need @logical_start + * and @logical_length parameter. + */ +static int scrub_simple_mirror(struct scrub_ctx *sctx, + struct btrfs_root *extent_root, + struct btrfs_root *csum_root, + struct btrfs_block_group *bg, + struct map_lookup *map, + u64 logical_start, u64 logical_length, + struct btrfs_device *device, + u64 physical, int mirror_num) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + const u64 logical_end = logical_start + logical_length; + /* An artificial limit, inherit from old scrub behavior */ + const u32 max_length = SZ_64K; + struct btrfs_path path = { 0 }; + u64 cur_logical = logical_start; + int ret; + + /* The range must be inside the bg */ + ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); + + path.search_commit_root = 1; + path.skip_locking = 1; + /* Go through each extent items inside the logical range */ + while (cur_logical < logical_end) { + u64 extent_start; + u64 extent_len; + u64 extent_flags; + u64 extent_gen; + u64 scrub_len; + + /* Canceled? */ + if (atomic_read(&fs_info->scrub_cancel_req) || + atomic_read(&sctx->cancel_req)) { + ret = -ECANCELED; + break; + } + /* Paused? */ + if (atomic_read(&fs_info->scrub_pause_req)) { + /* Push queued extents */ + sctx->flush_all_writes = true; + scrub_submit(sctx); + mutex_lock(&sctx->wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_lock); + wait_event(sctx->list_wait, + atomic_read(&sctx->bios_in_flight) == 0); + sctx->flush_all_writes = false; + scrub_blocked_if_needed(fs_info); + } + /* Block group removed? */ + spin_lock(&bg->lock); + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) { + spin_unlock(&bg->lock); + ret = 0; + break; + } + spin_unlock(&bg->lock); + + ret = find_first_extent_item(extent_root, &path, cur_logical, + logical_end - cur_logical); + if (ret > 0) { + /* No more extent, just update the accounting */ + sctx->stat.last_physical = physical + logical_length; + ret = 0; + break; + } + if (ret < 0) + break; + get_extent_info(&path, &extent_start, &extent_len, + &extent_flags, &extent_gen); + /* Skip hole range which doesn't have any extent */ + cur_logical = max(extent_start, cur_logical); + + /* + * Scrub len has three limits: + * - Extent size limit + * - Scrub range limit + * This is especially imporatant for RAID0/RAID10 to reuse + * this function + * - Max scrub size limit + */ + scrub_len = min(min(extent_start + extent_len, + logical_end), cur_logical + max_length) - + cur_logical; + + if (extent_flags & BTRFS_EXTENT_FLAG_DATA) { + ret = btrfs_lookup_csums_range(csum_root, cur_logical, + cur_logical + scrub_len - 1, + &sctx->csum_list, 1, false); + if (ret) + break; + } + if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && + does_range_cross_boundary(extent_start, extent_len, + logical_start, logical_length)) { + btrfs_err(fs_info, +"scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)", + extent_start, logical_start, logical_end); + spin_lock(&sctx->stat_lock); + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + cur_logical += scrub_len; + continue; + } + ret = scrub_extent(sctx, map, cur_logical, scrub_len, + cur_logical - logical_start + physical, + device, extent_flags, extent_gen, + mirror_num); + scrub_free_csums(sctx); + if (ret) + break; + if (sctx->is_dev_replace) + sync_replace_for_zoned(sctx); + cur_logical += scrub_len; + /* Don't hold CPU for too long time */ + cond_resched(); + } + btrfs_release_path(&path); + return ret; +} + +/* Calculate the full stripe length for simple stripe based profiles */ +static u64 simple_stripe_full_stripe_len(const struct map_lookup *map) +{ + ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)); + + return map->num_stripes / map->sub_stripes * map->stripe_len; +} + +/* Get the logical bytenr for the stripe */ +static u64 simple_stripe_get_logical(struct map_lookup *map, + struct btrfs_block_group *bg, + int stripe_index) +{ + ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)); + ASSERT(stripe_index < map->num_stripes); + + /* + * (stripe_index / sub_stripes) gives how many data stripes we need to + * skip. + */ + return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start; +} + +/* Get the mirror number for the stripe */ +static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index) +{ + ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)); + ASSERT(stripe_index < map->num_stripes); + + /* For RAID0, it's fixed to 1, for RAID10 it's 0,1,0,1... */ + return stripe_index % map->sub_stripes + 1; +} + +static int scrub_simple_stripe(struct scrub_ctx *sctx, + struct btrfs_root *extent_root, + struct btrfs_root *csum_root, + struct btrfs_block_group *bg, + struct map_lookup *map, + struct btrfs_device *device, + int stripe_index) +{ + const u64 logical_increment = simple_stripe_full_stripe_len(map); + const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index); + const u64 orig_physical = map->stripes[stripe_index].physical; + const int mirror_num = simple_stripe_mirror_num(map, stripe_index); + u64 cur_logical = orig_logical; + u64 cur_physical = orig_physical; + int ret = 0; + + while (cur_logical < bg->start + bg->length) { + /* + * Inside each stripe, RAID0 is just SINGLE, and RAID10 is + * just RAID1, so we can reuse scrub_simple_mirror() to scrub + * this stripe. + */ + ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map, + cur_logical, map->stripe_len, device, + cur_physical, mirror_num); + if (ret) + return ret; + /* Skip to next stripe which belongs to the target device */ + cur_logical += logical_increment; + /* For physical offset, we just go to next stripe */ + cur_physical += map->stripe_len; + } + return ret; +} + static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, - struct map_lookup *map, + struct extent_map *em, struct btrfs_device *scrub_dev, - int stripe_index, u64 dev_extent_len) + int stripe_index) { struct btrfs_path *path; struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_root *root; struct btrfs_root *csum_root; - struct btrfs_extent_item *extent; struct blk_plug plug; + struct map_lookup *map = em->map_lookup; + const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; const u64 chunk_logical = bg->start; - u64 flags; int ret; - int slot; - u64 nstripes; - struct extent_buffer *l; - u64 physical; + u64 physical = map->stripes[stripe_index].physical; + const u64 dev_stripe_len = btrfs_calc_stripe_length(em); + const u64 physical_end = physical + dev_stripe_len; u64 logical; u64 logic_end; - u64 physical_end; - u64 generation; - int mirror_num; - struct btrfs_key key; - u64 increment = map->stripe_len; + /* The logical increment after finishing one stripe */ + u64 increment; + /* Offset inside the chunk */ u64 offset; - u64 extent_logical; - u64 extent_physical; - /* - * Unlike chunk length, extent length should never go beyond - * BTRFS_MAX_EXTENT_SIZE, thus u32 is enough here. - */ - u32 extent_len; u64 stripe_logical; u64 stripe_end; - struct btrfs_device *extent_dev; - int extent_mirror_num; int stop_loop = 0; - physical = map->stripes[stripe_index].physical; - offset = 0; - nstripes = div64_u64(dev_extent_len, map->stripe_len); - mirror_num = 1; - increment = map->stripe_len; - if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - offset = map->stripe_len * stripe_index; - increment = map->stripe_len * map->num_stripes; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - int factor = map->num_stripes / map->sub_stripes; - offset = map->stripe_len * (stripe_index / map->sub_stripes); - increment = map->stripe_len * factor; - mirror_num = stripe_index % map->sub_stripes + 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { - mirror_num = stripe_index % map->num_stripes + 1; - } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - mirror_num = stripe_index % map->num_stripes + 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - get_raid56_logic_offset(physical, stripe_index, map, &offset, - NULL); - increment = map->stripe_len * nr_data_stripes(map); - } - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -3241,21 +3594,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, path->skip_locking = 1; path->reada = READA_FORWARD; - logical = chunk_logical + offset; - physical_end = physical + nstripes * map->stripe_len; - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - get_raid56_logic_offset(physical_end, stripe_index, - map, &logic_end, NULL); - logic_end += chunk_logical; - } else { - logic_end = logical + increment * nstripes; - } wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); scrub_blocked_if_needed(fs_info); - root = btrfs_extent_root(fs_info, logical); - csum_root = btrfs_csum_root(fs_info, logical); + root = btrfs_extent_root(fs_info, bg->start); + csum_root = btrfs_csum_root(fs_info, bg->start); /* * collect all data csums for the stripe to avoid seeking during @@ -3272,247 +3616,89 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, } /* - * now find all extents for each stripe and scrub them + * There used to be a big double loop to handle all profiles using the + * same routine, which grows larger and more gross over time. + * + * So here we handle each profile differently, so simpler profiles + * have simpler scrubbing function. */ - ret = 0; - while (physical < physical_end) { + if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID56_MASK))) { /* - * canceled? - */ - if (atomic_read(&fs_info->scrub_cancel_req) || - atomic_read(&sctx->cancel_req)) { - ret = -ECANCELED; - goto out; - } - /* - * check to see if we have to pause + * Above check rules out all complex profile, the remaining + * profiles are SINGLE|DUP|RAID1|RAID1C*, which is simple + * mirrored duplication without stripe. + * + * Only @physical and @mirror_num needs to calculated using + * @stripe_index. */ - if (atomic_read(&fs_info->scrub_pause_req)) { - /* push queued extents */ - sctx->flush_all_writes = true; - scrub_submit(sctx); - mutex_lock(&sctx->wr_lock); - scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_lock); - wait_event(sctx->list_wait, - atomic_read(&sctx->bios_in_flight) == 0); - sctx->flush_all_writes = false; - scrub_blocked_if_needed(fs_info); - } - - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - ret = get_raid56_logic_offset(physical, stripe_index, - map, &logical, - &stripe_logical); - logical += chunk_logical; - if (ret) { - /* it is parity strip */ - stripe_logical += chunk_logical; - stripe_end = stripe_logical + increment; - ret = scrub_raid56_parity(sctx, map, scrub_dev, - stripe_logical, - stripe_end); - if (ret) - goto out; - goto skip; - } - } - - if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) - key.type = BTRFS_METADATA_ITEM_KEY; - else - key.type = BTRFS_EXTENT_ITEM_KEY; - key.objectid = logical; - key.offset = (u64)-1; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - if (ret > 0) { - ret = btrfs_previous_extent_item(root, path, 0); - if (ret < 0) - goto out; - if (ret > 0) { - /* there's no smaller item, so stick with the - * larger one */ - btrfs_release_path(path); - ret = btrfs_search_slot(NULL, root, &key, - path, 0, 0); - if (ret < 0) - goto out; - } - } - - stop_loop = 0; - while (1) { - u64 bytes; - - l = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(l)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto out; - - stop_loop = 1; - break; - } - btrfs_item_key_to_cpu(l, &key, slot); - - if (key.type != BTRFS_EXTENT_ITEM_KEY && - key.type != BTRFS_METADATA_ITEM_KEY) - goto next; - - if (key.type == BTRFS_METADATA_ITEM_KEY) - bytes = fs_info->nodesize; - else - bytes = key.offset; - - if (key.objectid + bytes <= logical) - goto next; - - if (key.objectid >= logical + map->stripe_len) { - /* out of this device extent */ - if (key.objectid >= logic_end) - stop_loop = 1; - break; - } - - /* - * If our block group was removed in the meanwhile, just - * stop scrubbing since there is no point in continuing. - * Continuing would prevent reusing its device extents - * for new block groups for a long time. - */ - spin_lock(&bg->lock); - if (bg->removed) { - spin_unlock(&bg->lock); - ret = 0; - goto out; - } - spin_unlock(&bg->lock); - - extent = btrfs_item_ptr(l, slot, - struct btrfs_extent_item); - flags = btrfs_extent_flags(l, extent); - generation = btrfs_extent_generation(l, extent); - - if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && - (key.objectid < logical || - key.objectid + bytes > - logical + map->stripe_len)) { - btrfs_err(fs_info, - "scrub: tree block %llu spanning stripes, ignored. logical=%llu", - key.objectid, logical); - spin_lock(&sctx->stat_lock); - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - goto next; - } - -again: - extent_logical = key.objectid; - ASSERT(bytes <= U32_MAX); - extent_len = bytes; - - /* - * trim extent to this stripe - */ - if (extent_logical < logical) { - extent_len -= logical - extent_logical; - extent_logical = logical; - } - if (extent_logical + extent_len > - logical + map->stripe_len) { - extent_len = logical + map->stripe_len - - extent_logical; - } + ret = scrub_simple_mirror(sctx, root, csum_root, bg, map, + bg->start, bg->length, scrub_dev, + map->stripes[stripe_index].physical, + stripe_index + 1); + offset = 0; + goto out; + } + if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { + ret = scrub_simple_stripe(sctx, root, csum_root, bg, map, + scrub_dev, stripe_index); + offset = map->stripe_len * (stripe_index / map->sub_stripes); + goto out; + } - extent_physical = extent_logical - logical + physical; - extent_dev = scrub_dev; - extent_mirror_num = mirror_num; - if (sctx->is_dev_replace) - scrub_remap_extent(fs_info, extent_logical, - extent_len, &extent_physical, - &extent_dev, - &extent_mirror_num); - - if (flags & BTRFS_EXTENT_FLAG_DATA) { - ret = btrfs_lookup_csums_range(csum_root, - extent_logical, - extent_logical + extent_len - 1, - &sctx->csum_list, 1); - if (ret) - goto out; - } + /* Only RAID56 goes through the old code */ + ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK); + ret = 0; - ret = scrub_extent(sctx, map, extent_logical, extent_len, - extent_physical, extent_dev, flags, - generation, extent_mirror_num, - extent_logical - logical + physical); + /* Calculate the logical end of the stripe */ + get_raid56_logic_offset(physical_end, stripe_index, + map, &logic_end, NULL); + logic_end += chunk_logical; - scrub_free_csums(sctx); + /* Initialize @offset in case we need to go to out: label */ + get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL); + increment = map->stripe_len * nr_data_stripes(map); + /* + * Due to the rotation, for RAID56 it's better to iterate each stripe + * using their physical offset. + */ + while (physical < physical_end) { + ret = get_raid56_logic_offset(physical, stripe_index, map, + &logical, &stripe_logical); + logical += chunk_logical; + if (ret) { + /* it is parity strip */ + stripe_logical += chunk_logical; + stripe_end = stripe_logical + increment; + ret = scrub_raid56_parity(sctx, map, scrub_dev, + stripe_logical, + stripe_end); if (ret) goto out; + goto next; + } - if (sctx->is_dev_replace) - sync_replace_for_zoned(sctx); - - if (extent_logical + extent_len < - key.objectid + bytes) { - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - /* - * loop until we find next data stripe - * or we have finished all stripes. - */ -loop: - physical += map->stripe_len; - ret = get_raid56_logic_offset(physical, - stripe_index, map, - &logical, &stripe_logical); - logical += chunk_logical; - - if (ret && physical < physical_end) { - stripe_logical += chunk_logical; - stripe_end = stripe_logical + - increment; - ret = scrub_raid56_parity(sctx, - map, scrub_dev, - stripe_logical, - stripe_end); - if (ret) - goto out; - goto loop; - } - } else { - physical += map->stripe_len; - logical += increment; - } - if (logical < key.objectid + bytes) { - cond_resched(); - goto again; - } - - if (physical >= physical_end) { - stop_loop = 1; - break; - } - } + /* + * Now we're at a data stripe, scrub each extents in the range. + * + * At this stage, if we ignore the repair part, inside each data + * stripe it is no different than SINGLE profile. + * We can reuse scrub_simple_mirror() here, as the repair part + * is still based on @mirror_num. + */ + ret = scrub_simple_mirror(sctx, root, csum_root, bg, map, + logical, map->stripe_len, + scrub_dev, physical, 1); + if (ret < 0) + goto out; next: - path->slots[0]++; - } - btrfs_release_path(path); -skip: logical += increment; physical += map->stripe_len; spin_lock(&sctx->stat_lock); if (stop_loop) - sctx->stat.last_physical = map->stripes[stripe_index].physical + - dev_extent_len; + sctx->stat.last_physical = + map->stripes[stripe_index].physical + dev_stripe_len; else sctx->stat.last_physical = physical; spin_unlock(&sctx->stat_lock); @@ -3566,7 +3752,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, * kthread or relocation. */ spin_lock(&bg->lock); - if (!bg->removed) + if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) ret = -EINVAL; spin_unlock(&bg->lock); @@ -3581,8 +3767,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, for (i = 0; i < map->num_stripes; ++i) { if (map->stripes[i].dev->bdev == scrub_dev->bdev && map->stripes[i].physical == dev_offset) { - ret = scrub_stripe(sctx, bg, map, scrub_dev, i, - dev_extent_len); + ret = scrub_stripe(sctx, bg, em, scrub_dev, i); if (ret) goto out; } @@ -3699,14 +3884,36 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (!cache) goto skip; + ASSERT(cache->start <= chunk_offset); + /* + * We are using the commit root to search for device extents, so + * that means we could have found a device extent item from a + * block group that was deleted in the current transaction. The + * logical start offset of the deleted block group, stored at + * @chunk_offset, might be part of the logical address range of + * a new block group (which uses different physical extents). + * In this case btrfs_lookup_block_group() has returned the new + * block group, and its start address is less than @chunk_offset. + * + * We skip such new block groups, because it's pointless to + * process them, as we won't find their extents because we search + * for them using the commit root of the extent tree. For a device + * replace it's also fine to skip it, we won't miss copying them + * to the target device because we have the write duplication + * setup through the regular write path (by btrfs_map_block()), + * and we have committed a transaction when we started the device + * replace, right after setting up the device replace state. + */ + if (cache->start < chunk_offset) { + btrfs_put_block_group(cache); + goto skip; + } + if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) { - spin_lock(&cache->lock); - if (!cache->to_copy) { - spin_unlock(&cache->lock); + if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) { btrfs_put_block_group(cache); goto skip; } - spin_unlock(&cache->lock); } /* @@ -3718,7 +3925,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, * repair extents. */ spin_lock(&cache->lock); - if (cache->removed) { + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) { spin_unlock(&cache->lock); btrfs_put_block_group(cache); goto skip; @@ -3822,7 +4029,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, dev_replace->item_needs_writeback = 1; up_write(&dev_replace->rwsem); - ASSERT(cache->start == chunk_offset); ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset, dev_extent_len); @@ -3879,8 +4085,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, * balance is triggered or it becomes used and unused again. */ spin_lock(&cache->lock); - if (!cache->removed && !cache->ro && cache->reserved == 0 && - cache->used == 0) { + if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) && + !cache->ro && cache->reserved == 0 && cache->used == 0) { spin_unlock(&cache->lock); if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) btrfs_discard_queue_work(&fs_info->discard_ctl, @@ -3940,9 +4146,9 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, if (!btrfs_check_super_location(scrub_dev, bytenr)) continue; - ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, - scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, - NULL, bytenr); + ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, + scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, + NULL, bytenr); if (ret) return ret; } @@ -3955,22 +4161,23 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info) { if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt, &fs_info->scrub_lock)) { - struct btrfs_workqueue *scrub_workers = NULL; - struct btrfs_workqueue *scrub_wr_comp = NULL; - struct btrfs_workqueue *scrub_parity = NULL; - - scrub_workers = fs_info->scrub_workers; - scrub_wr_comp = fs_info->scrub_wr_completion_workers; - scrub_parity = fs_info->scrub_parity_workers; + struct workqueue_struct *scrub_workers = fs_info->scrub_workers; + struct workqueue_struct *scrub_wr_comp = + fs_info->scrub_wr_completion_workers; + struct workqueue_struct *scrub_parity = + fs_info->scrub_parity_workers; fs_info->scrub_workers = NULL; fs_info->scrub_wr_completion_workers = NULL; fs_info->scrub_parity_workers = NULL; mutex_unlock(&fs_info->scrub_lock); - btrfs_destroy_workqueue(scrub_workers); - btrfs_destroy_workqueue(scrub_wr_comp); - btrfs_destroy_workqueue(scrub_parity); + if (scrub_workers) + destroy_workqueue(scrub_workers); + if (scrub_wr_comp) + destroy_workqueue(scrub_wr_comp); + if (scrub_parity) + destroy_workqueue(scrub_parity); } } @@ -3980,9 +4187,9 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info) static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, int is_dev_replace) { - struct btrfs_workqueue *scrub_workers = NULL; - struct btrfs_workqueue *scrub_wr_comp = NULL; - struct btrfs_workqueue *scrub_parity = NULL; + struct workqueue_struct *scrub_workers = NULL; + struct workqueue_struct *scrub_wr_comp = NULL; + struct workqueue_struct *scrub_parity = NULL; unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; int max_active = fs_info->thread_pool_size; int ret = -ENOMEM; @@ -3990,18 +4197,16 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt)) return 0; - scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags, - is_dev_replace ? 1 : max_active, 4); + scrub_workers = alloc_workqueue("btrfs-scrub", flags, + is_dev_replace ? 1 : max_active); if (!scrub_workers) goto fail_scrub_workers; - scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags, - max_active, 2); + scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active); if (!scrub_wr_comp) goto fail_scrub_wr_completion_workers; - scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags, - max_active, 2); + scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active); if (!scrub_parity) goto fail_scrub_parity_workers; @@ -4022,11 +4227,11 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_info->scrub_lock); ret = 0; - btrfs_destroy_workqueue(scrub_parity); + destroy_workqueue(scrub_parity); fail_scrub_parity_workers: - btrfs_destroy_workqueue(scrub_wr_comp); + destroy_workqueue(scrub_wr_comp); fail_scrub_wr_completion_workers: - btrfs_destroy_workqueue(scrub_workers); + destroy_workqueue(scrub_workers); fail_scrub_workers: return ret; } @@ -4040,38 +4245,21 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, int ret; struct btrfs_device *dev; unsigned int nofs_flag; + bool need_commit = false; if (btrfs_fs_closing(fs_info)) return -EAGAIN; - if (fs_info->nodesize > BTRFS_STRIPE_LEN) { - /* - * in this case scrub is unable to calculate the checksum - * the way scrub is implemented. Do not handle this - * situation at all because it won't ever happen. - */ - btrfs_err(fs_info, - "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails", - fs_info->nodesize, - BTRFS_STRIPE_LEN); - return -EINVAL; - } + /* At mount time we have ensured nodesize is in the range of [4K, 64K]. */ + ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN); - if (fs_info->nodesize > - PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK || - fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) { - /* - * would exhaust the array bounds of pagev member in - * struct scrub_block - */ - btrfs_err(fs_info, - "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails", - fs_info->nodesize, - SCRUB_MAX_PAGES_PER_BLOCK, - fs_info->sectorsize, - SCRUB_MAX_PAGES_PER_BLOCK); - return -EINVAL; - } + /* + * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible + * value (max nodesize / min sectorsize), thus nodesize should always + * be fine. + */ + ASSERT(fs_info->nodesize <= + SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits); /* Allocate outside of device_list_mutex */ sctx = scrub_setup_ctx(fs_info, is_dev_replace); @@ -4137,7 +4325,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, /* * In order to avoid deadlock with reclaim when there is a transaction * trying to pause scrub, make sure we use GFP_NOFS for all the - * allocations done at btrfs_scrub_pages() and scrub_pages_for_parity() + * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity() * invoked by our callees. The pausing request is done when the * transaction commit starts, and it blocks the transaction until scrub * is paused (done at specific points at scrub_stripe() or right above @@ -4145,6 +4333,12 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, */ nofs_flag = memalloc_nofs_save(); if (!is_dev_replace) { + u64 old_super_errors; + + spin_lock(&sctx->stat_lock); + old_super_errors = sctx->stat.super_errors; + spin_unlock(&sctx->stat_lock); + btrfs_info(fs_info, "scrub: started on devid %llu", devid); /* * by holding device list mutex, we can @@ -4153,6 +4347,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_lock(&fs_info->fs_devices->device_list_mutex); ret = scrub_supers(sctx, dev); mutex_unlock(&fs_info->fs_devices->device_list_mutex); + + spin_lock(&sctx->stat_lock); + /* + * Super block errors found, but we can not commit transaction + * at current context, since btrfs_commit_transaction() needs + * to pause the current running scrub (hold by ourselves). + */ + if (sctx->stat.super_errors > old_super_errors && !sctx->readonly) + need_commit = true; + spin_unlock(&sctx->stat_lock); } if (!ret) @@ -4179,6 +4383,25 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, scrub_workers_put(fs_info); scrub_put_ctx(sctx); + /* + * We found some super block errors before, now try to force a + * transaction commit, as scrub has finished. + */ + if (need_commit) { + struct btrfs_trans_handle *trans; + + trans = btrfs_start_transaction(fs_info->tree_root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_err(fs_info, + "scrub: failed to start transaction to fix super block errors: %d", ret); + return ret; + } + ret = btrfs_commit_transaction(trans); + if (ret < 0) + btrfs_err(fs_info, + "scrub: failed to commit transaction to fix super block errors: %d", ret); + } return ret; out: scrub_workers_put(fs_info); @@ -4271,11 +4494,11 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV; } -static void scrub_remap_extent(struct btrfs_fs_info *fs_info, - u64 extent_logical, u32 extent_len, - u64 *extent_physical, - struct btrfs_device **extent_dev, - int *extent_mirror_num) +static void scrub_find_good_copy(struct btrfs_fs_info *fs_info, + u64 extent_logical, u32 extent_len, + u64 *extent_physical, + struct btrfs_device **extent_dev, + int *extent_mirror_num) { u64 mapped_length; struct btrfs_io_context *bioc = NULL; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d8ccb62aa7d2..1c4b693ee4a3 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -15,8 +15,10 @@ #include <linux/string.h> #include <linux/compat.h> #include <linux/crc32c.h> +#include <linux/fsverity.h> #include "send.h" +#include "ctree.h" #include "backref.h" #include "locking.h" #include "disk-io.h" @@ -82,8 +84,12 @@ struct send_ctx { char *send_buf; u32 send_size; u32 send_max_size; - u64 total_send_size; - u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; + /* + * Whether BTRFS_SEND_A_DATA attribute was already added to current + * command (since protocol v2, data must be the last attribute). + */ + bool put_data; + struct page **send_buf_pages; u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ /* Protocol version compatibility requested */ u32 proto; @@ -113,15 +119,17 @@ struct send_ctx { */ u64 cur_ino; u64 cur_inode_gen; - int cur_inode_new; - int cur_inode_new_gen; - int cur_inode_deleted; u64 cur_inode_size; u64 cur_inode_mode; u64 cur_inode_rdev; u64 cur_inode_last_extent; u64 cur_inode_next_write_offset; + bool cur_inode_new; + bool cur_inode_new_gen; + bool cur_inode_deleted; bool ignore_cur_inode; + bool cur_inode_needs_verity; + void *verity_descriptor; u64 send_progress; @@ -132,7 +140,14 @@ struct send_ctx { struct list_head name_cache_list; int name_cache_size; + /* + * The inode we are currently processing. It's not NULL only when we + * need to issue write commands for data extents from this inode. + */ + struct inode *cur_inode; struct file_ra_state ra; + u64 page_cache_clear_start; + bool clean_page_cache; /* * We process inodes by their increasing order, so if before an @@ -228,6 +243,9 @@ struct send_ctx { * Indexed by the inode number of the directory to be deleted. */ struct rb_root orphan_dirs; + + struct rb_root rbtree_new_refs; + struct rb_root rbtree_deleted_refs; }; struct pending_dir_move { @@ -328,8 +346,9 @@ __maybe_unused static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd) { switch (sctx->proto) { - case 1: return cmd < __BTRFS_SEND_C_MAX_V1; - case 2: return cmd < __BTRFS_SEND_C_MAX_V2; + case 1: return cmd <= BTRFS_SEND_C_MAX_V1; + case 2: return cmd <= BTRFS_SEND_C_MAX_V2; + case 3: return cmd <= BTRFS_SEND_C_MAX_V3; default: return false; } } @@ -528,17 +547,12 @@ out: static int fs_path_copy(struct fs_path *p, struct fs_path *from) { - int ret; - p->reversed = from->reversed; fs_path_reset(p); - ret = fs_path_add_path(p, from); - - return ret; + return fs_path_add_path(p, from); } - static void fs_path_unreverse(struct fs_path *p) { char *tmp; @@ -575,15 +589,10 @@ static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off) while (pos < len) { ret = kernel_write(filp, buf + pos, len - pos, off); - /* TODO handle that correctly */ - /*if (ret == -ERESTARTSYS) { - continue; - }*/ if (ret < 0) return ret; - if (ret == 0) { + if (ret == 0) return -EIO; - } pos += ret; } @@ -596,6 +605,9 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) int total_len = sizeof(*hdr) + len; int left = sctx->send_max_size - sctx->send_size; + if (WARN_ON_ONCE(sctx->put_data)) + return -EINVAL; + if (unlikely(left < total_len)) return -EOVERFLOW; @@ -616,6 +628,8 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \ } +TLV_PUT_DEFINE_INT(8) +TLV_PUT_DEFINE_INT(32) TLV_PUT_DEFINE_INT(64) static int tlv_put_string(struct send_ctx *sctx, u16 attr, @@ -691,8 +705,7 @@ static int send_header(struct send_ctx *sctx) struct btrfs_stream_header hdr; strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); - hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION); - + hdr.version = cpu_to_le32(sctx->proto); return write_buf(sctx->send_filp, &hdr, sizeof(hdr), &sctx->send_off); } @@ -732,9 +745,8 @@ static int send_cmd(struct send_ctx *sctx) ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, &sctx->send_off); - sctx->total_send_size += sctx->send_size; - sctx->cmd_send_size[get_unaligned_le16(&hdr->cmd)] += sctx->send_size; sctx->send_size = 0; + sctx->put_data = false; return ret; } @@ -835,17 +847,32 @@ out: return ret; } +struct btrfs_inode_info { + u64 size; + u64 gen; + u64 mode; + u64 uid; + u64 gid; + u64 rdev; + u64 fileattr; + u64 nlink; +}; + /* * Helper function to retrieve some fields from an inode item. */ -static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path, - u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid, - u64 *gid, u64 *rdev) +static int get_inode_info(struct btrfs_root *root, u64 ino, + struct btrfs_inode_info *info) { int ret; + struct btrfs_path *path; struct btrfs_inode_item *ii; struct btrfs_key key; + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + key.objectid = ino; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; @@ -853,41 +880,43 @@ static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path, if (ret) { if (ret > 0) ret = -ENOENT; - return ret; + goto out; } + if (!info) + goto out; + ii = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); - if (size) - *size = btrfs_inode_size(path->nodes[0], ii); - if (gen) - *gen = btrfs_inode_generation(path->nodes[0], ii); - if (mode) - *mode = btrfs_inode_mode(path->nodes[0], ii); - if (uid) - *uid = btrfs_inode_uid(path->nodes[0], ii); - if (gid) - *gid = btrfs_inode_gid(path->nodes[0], ii); - if (rdev) - *rdev = btrfs_inode_rdev(path->nodes[0], ii); + info->size = btrfs_inode_size(path->nodes[0], ii); + info->gen = btrfs_inode_generation(path->nodes[0], ii); + info->mode = btrfs_inode_mode(path->nodes[0], ii); + info->uid = btrfs_inode_uid(path->nodes[0], ii); + info->gid = btrfs_inode_gid(path->nodes[0], ii); + info->rdev = btrfs_inode_rdev(path->nodes[0], ii); + info->nlink = btrfs_inode_nlink(path->nodes[0], ii); + /* + * Transfer the unchanged u64 value of btrfs_inode_item::flags, that's + * otherwise logically split to 32/32 parts. + */ + info->fileattr = btrfs_inode_flags(path->nodes[0], ii); +out: + btrfs_free_path(path); return ret; } -static int get_inode_info(struct btrfs_root *root, - u64 ino, u64 *size, u64 *gen, - u64 *mode, u64 *uid, u64 *gid, - u64 *rdev) +static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) { - struct btrfs_path *path; int ret; + struct btrfs_inode_info info; - path = alloc_path_for_send(); - if (!path) - return -ENOMEM; - ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid, - rdev); - btrfs_free_path(path); + if (!gen) + return -EPERM; + + ret = get_inode_info(root, ino, &info); + if (!ret) + *gen = info.gen; return ret; } @@ -1630,21 +1659,22 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) int right_ret; u64 left_gen; u64 right_gen; + struct btrfs_inode_info info; - ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL, - NULL, NULL); + ret = get_inode_info(sctx->send_root, ino, &info); if (ret < 0 && ret != -ENOENT) goto out; - left_ret = ret; + left_ret = (info.nlink == 0) ? -ENOENT : ret; + left_gen = info.gen; if (!sctx->parent_root) { right_ret = -ENOENT; } else { - ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen, - NULL, NULL, NULL, NULL); + ret = get_inode_info(sctx->parent_root, ino, &info); if (ret < 0 && ret != -ENOENT) goto out; - right_ret = ret; + right_ret = (info.nlink == 0) ? -ENOENT : ret; + right_gen = info.gen; } if (!left_ret && !right_ret) { @@ -1803,8 +1833,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, btrfs_release_path(path); if (dir_gen) { - ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, - NULL, NULL, NULL); + ret = get_inode_gen(root, parent_dir, dir_gen); if (ret < 0) goto out; } @@ -1861,6 +1890,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, int ret = 0; u64 gen; u64 other_inode = 0; + struct btrfs_inode_info info; if (!sctx->parent_root) goto out; @@ -1875,8 +1905,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, * and we can just unlink this entry. */ if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) { - ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, - NULL, NULL, NULL); + ret = get_inode_gen(sctx->parent_root, dir, &gen); if (ret < 0 && ret != -ENOENT) goto out; if (ret) { @@ -1903,13 +1932,14 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, */ if (other_inode > sctx->send_progress || is_waiting_for_move(sctx, other_inode)) { - ret = get_inode_info(sctx->parent_root, other_inode, NULL, - who_gen, who_mode, NULL, NULL, NULL); + ret = get_inode_info(sctx->parent_root, other_inode, &info); if (ret < 0) goto out; ret = 1; *who_ino = other_inode; + *who_gen = info.gen; + *who_mode = info.mode; } else { ret = 0; } @@ -1942,8 +1972,7 @@ static int did_overwrite_ref(struct send_ctx *sctx, goto out; if (dir != BTRFS_FIRST_FREE_OBJECTID) { - ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, - NULL, NULL, NULL); + ret = get_inode_gen(sctx->send_root, dir, &gen); if (ret < 0 && ret != -ENOENT) goto out; if (ret) { @@ -1965,8 +1994,7 @@ static int did_overwrite_ref(struct send_ctx *sctx, goto out; } - ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL, - NULL, NULL); + ret = get_inode_gen(sctx->send_root, ow_inode, &gen); if (ret < 0) goto out; @@ -2182,7 +2210,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, /* * If the inode is not existent yet, add the orphan name and return 1. * This should only happen for the parent dir that we determine in - * __record_new_ref + * record_new_ref_if_needed(). */ ret = is_inode_existent(sctx, ino, gen); if (ret < 0) @@ -2497,6 +2525,39 @@ out: return ret; } +static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr) +{ + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + int ret = 0; + struct fs_path *p; + + if (sctx->proto < 2) + return 0; + + btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr); + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, ino, gen, p); + if (ret < 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) { struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; @@ -2576,7 +2637,8 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime); TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime); TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime); - /* TODO Add otime support when the otime patches get into upstream */ + if (sctx->proto >= 2) + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_OTIME, eb, &ii->otime); ret = send_cmd(sctx); @@ -2598,6 +2660,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino) int ret = 0; struct fs_path *p; int cmd; + struct btrfs_inode_info info; u64 gen; u64 mode; u64 rdev; @@ -2609,10 +2672,12 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino) return -ENOMEM; if (ino != sctx->cur_ino) { - ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, - NULL, NULL, &rdev); + ret = get_inode_info(sctx->send_root, ino, &info); if (ret < 0) goto out; + gen = info.gen; + mode = info.mode; + rdev = info.rdev; } else { gen = sctx->cur_inode_gen; mode = sctx->cur_inode_mode; @@ -2680,61 +2745,43 @@ out: static int did_create_dir(struct send_ctx *sctx, u64 dir) { int ret = 0; + int iter_ret = 0; struct btrfs_path *path = NULL; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_key di_key; - struct extent_buffer *eb; struct btrfs_dir_item *di; - int slot; path = alloc_path_for_send(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0); - if (ret < 0) - goto out; - while (1) { - eb = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(sctx->send_root, path); - if (ret < 0) { - goto out; - } else if (ret > 0) { - ret = 0; - break; - } - continue; - } + btrfs_for_each_slot(sctx->send_root, &key, &found_key, path, iter_ret) { + struct extent_buffer *eb = path->nodes[0]; - btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || found_key.type != key.type) { ret = 0; - goto out; + break; } - di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); + di = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(eb, di, &di_key); if (di_key.type != BTRFS_ROOT_ITEM_KEY && di_key.objectid < sctx->send_progress) { ret = 1; - goto out; + break; } - - path->slots[0]++; } + /* Catch error found during iteration */ + if (iter_ret < 0) + ret = iter_ret; -out: btrfs_free_path(path); return ret; } @@ -2767,48 +2814,50 @@ struct recorded_ref { u64 dir; u64 dir_gen; int name_len; + struct rb_node node; + struct rb_root *root; }; -static void set_ref_path(struct recorded_ref *ref, struct fs_path *path) +static struct recorded_ref *recorded_ref_alloc(void) { - ref->full_path = path; - ref->name = (char *)kbasename(ref->full_path->start); - ref->name_len = ref->full_path->end - ref->name; + struct recorded_ref *ref; + + ref = kzalloc(sizeof(*ref), GFP_KERNEL); + if (!ref) + return NULL; + RB_CLEAR_NODE(&ref->node); + INIT_LIST_HEAD(&ref->list); + return ref; } -/* - * We need to process new refs before deleted refs, but compare_tree gives us - * everything mixed. So we first record all refs and later process them. - * This function is a helper to record one ref. - */ -static int __record_ref(struct list_head *head, u64 dir, - u64 dir_gen, struct fs_path *path) +static void recorded_ref_free(struct recorded_ref *ref) { - struct recorded_ref *ref; - - ref = kmalloc(sizeof(*ref), GFP_KERNEL); if (!ref) - return -ENOMEM; + return; + if (!RB_EMPTY_NODE(&ref->node)) + rb_erase(&ref->node, ref->root); + list_del(&ref->list); + fs_path_free(ref->full_path); + kfree(ref); +} - ref->dir = dir; - ref->dir_gen = dir_gen; - set_ref_path(ref, path); - list_add_tail(&ref->list, head); - return 0; +static void set_ref_path(struct recorded_ref *ref, struct fs_path *path) +{ + ref->full_path = path; + ref->name = (char *)kbasename(ref->full_path->start); + ref->name_len = ref->full_path->end - ref->name; } static int dup_ref(struct recorded_ref *ref, struct list_head *list) { struct recorded_ref *new; - new = kmalloc(sizeof(*ref), GFP_KERNEL); + new = recorded_ref_alloc(); if (!new) return -ENOMEM; new->dir = ref->dir; new->dir_gen = ref->dir_gen; - new->full_path = NULL; - INIT_LIST_HEAD(&new->list); list_add_tail(&new->list, list); return 0; } @@ -2819,9 +2868,7 @@ static void __free_recorded_refs(struct list_head *head) while (!list_empty(head)) { cur = list_entry(head->next, struct recorded_ref, list); - fs_path_free(cur->full_path); - list_del(&cur->list); - kfree(cur); + recorded_ref_free(cur); } } @@ -2938,6 +2985,7 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, u64 send_progress) { int ret = 0; + int iter_ret = 0; struct btrfs_root *root = sctx->parent_root; struct btrfs_path *path; struct btrfs_key key; @@ -2964,23 +3012,9 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, if (odi) key.offset = odi->last_dir_index_offset; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - while (1) { + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct waiting_dir_move *dm; - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out; - else if (ret > 0) - break; - continue; - } - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); if (found_key.objectid != key.objectid || found_key.type != key.type) break; @@ -3015,8 +3049,10 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, ret = 0; goto out; } - - path->slots[0]++; + } + if (iter_ret < 0) { + ret = iter_ret; + goto out; } free_orphan_dir_info(sctx, odi); @@ -3341,8 +3377,7 @@ finish: /* * The parent inode might have been deleted in the send snapshot */ - ret = get_inode_info(sctx->send_root, cur->dir, NULL, - NULL, NULL, NULL, NULL, NULL); + ret = get_inode_info(sctx->send_root, cur->dir, NULL); if (ret == -ENOENT) { ret = 0; continue; @@ -3516,12 +3551,10 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, goto out; } - ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL, - &left_gen, NULL, NULL, NULL, NULL); + ret = get_inode_gen(sctx->parent_root, di_key.objectid, &left_gen); if (ret < 0) goto out; - ret = get_inode_info(sctx->send_root, di_key.objectid, NULL, - &right_gen, NULL, NULL, NULL, NULL); + ret = get_inode_gen(sctx->send_root, di_key.objectid, &right_gen); if (ret < 0) { if (ret == -ENOENT) ret = 0; @@ -3584,7 +3617,7 @@ static int check_ino_in_path(struct btrfs_root *root, } /* - * Check if ino ino1 is an ancestor of inode ino2 in the given root for any + * Check if inode ino1 is an ancestor of inode ino2 in the given root for any * possible path (in case ino2 is not a directory and has multiple hard links). * Return 1 if true, 0 if false and < 0 on error. */ @@ -3596,6 +3629,7 @@ static int is_ancestor(struct btrfs_root *root, { bool free_fs_path = false; int ret = 0; + int iter_ret = 0; struct btrfs_path *path = NULL; struct btrfs_key key; @@ -3616,26 +3650,12 @@ static int is_ancestor(struct btrfs_root *root, key.type = BTRFS_INODE_REF_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - while (true) { + btrfs_for_each_slot(root, &key, &key, path, iter_ret) { struct extent_buffer *leaf = path->nodes[0]; int slot = path->slots[0]; u32 cur_offset = 0; u32 item_size; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out; - if (ret > 0) - break; - continue; - } - - btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != ino2) break; if (key.type != BTRFS_INODE_REF_KEY && @@ -3664,8 +3684,7 @@ static int is_ancestor(struct btrfs_root *root, cur_offset = item_size; } - ret = get_inode_info(root, parent, NULL, &parent_gen, - NULL, NULL, NULL, NULL); + ret = get_inode_gen(root, parent, &parent_gen); if (ret < 0) goto out; ret = check_ino_in_path(root, ino1, ino1_gen, @@ -3673,10 +3692,12 @@ static int is_ancestor(struct btrfs_root *root, if (ret) goto out; } - path->slots[0]++; } ret = 0; - out: + if (iter_ret < 0) + ret = iter_ret; + +out: btrfs_free_path(path); if (free_fs_path) fs_path_free(fs_path); @@ -3753,9 +3774,7 @@ static int wait_for_parent_move(struct send_ctx *sctx, memcmp(path_before->start, path_after->start, len1))) { u64 parent_ino_gen; - ret = get_inode_info(sctx->parent_root, ino, NULL, - &parent_ino_gen, NULL, NULL, NULL, - NULL); + ret = get_inode_gen(sctx->parent_root, ino, &parent_ino_gen); if (ret < 0) goto out; if (ino_gen == parent_ino_gen) { @@ -4349,185 +4368,167 @@ out: return ret; } -static int record_ref(struct btrfs_root *root, u64 dir, struct fs_path *name, - void *ctx, struct list_head *refs) +static int rbtree_ref_comp(const void *k, const struct rb_node *node) +{ + const struct recorded_ref *data = k; + const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node); + int result; + + if (data->dir > ref->dir) + return 1; + if (data->dir < ref->dir) + return -1; + if (data->dir_gen > ref->dir_gen) + return 1; + if (data->dir_gen < ref->dir_gen) + return -1; + if (data->name_len > ref->name_len) + return 1; + if (data->name_len < ref->name_len) + return -1; + result = strcmp(data->name, ref->name); + if (result > 0) + return 1; + if (result < 0) + return -1; + return 0; +} + +static bool rbtree_ref_less(struct rb_node *node, const struct rb_node *parent) +{ + const struct recorded_ref *entry = rb_entry(node, struct recorded_ref, node); + + return rbtree_ref_comp(entry, parent) < 0; +} + +static int record_ref_in_tree(struct rb_root *root, struct list_head *refs, + struct fs_path *name, u64 dir, u64 dir_gen, + struct send_ctx *sctx) { int ret = 0; - struct send_ctx *sctx = ctx; - struct fs_path *p; - u64 gen; + struct fs_path *path = NULL; + struct recorded_ref *ref = NULL; - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + path = fs_path_alloc(); + if (!path) { + ret = -ENOMEM; + goto out; + } - ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL, - NULL, NULL); - if (ret < 0) + ref = recorded_ref_alloc(); + if (!ref) { + ret = -ENOMEM; goto out; + } - ret = get_cur_path(sctx, dir, gen, p); + ret = get_cur_path(sctx, dir, dir_gen, path); if (ret < 0) goto out; - ret = fs_path_add_path(p, name); + ret = fs_path_add_path(path, name); if (ret < 0) goto out; - ret = __record_ref(refs, dir, gen, p); - + ref->dir = dir; + ref->dir_gen = dir_gen; + set_ref_path(ref, path); + list_add_tail(&ref->list, refs); + rb_add(&ref->node, root, rbtree_ref_less); + ref->root = root; out: - if (ret) - fs_path_free(p); + if (ret) { + if (path && (!ref || !ref->full_path)) + fs_path_free(path); + recorded_ref_free(ref); + } return ret; } -static int __record_new_ref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx) -{ - struct send_ctx *sctx = ctx; - return record_ref(sctx->send_root, dir, name, ctx, &sctx->new_refs); -} - - -static int __record_deleted_ref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx) +static int record_new_ref_if_needed(int num, u64 dir, int index, + struct fs_path *name, void *ctx) { + int ret = 0; struct send_ctx *sctx = ctx; - return record_ref(sctx->parent_root, dir, name, ctx, - &sctx->deleted_refs); -} - -static int record_new_ref(struct send_ctx *sctx) -{ - int ret; + struct rb_node *node = NULL; + struct recorded_ref data; + struct recorded_ref *ref; + u64 dir_gen; - ret = iterate_inode_ref(sctx->send_root, sctx->left_path, - sctx->cmp_key, 0, __record_new_ref, sctx); + ret = get_inode_gen(sctx->send_root, dir, &dir_gen); if (ret < 0) goto out; - ret = 0; + data.dir = dir; + data.dir_gen = dir_gen; + set_ref_path(&data, name); + node = rb_find(&data, &sctx->rbtree_deleted_refs, rbtree_ref_comp); + if (node) { + ref = rb_entry(node, struct recorded_ref, node); + recorded_ref_free(ref); + } else { + ret = record_ref_in_tree(&sctx->rbtree_new_refs, + &sctx->new_refs, name, dir, dir_gen, + sctx); + } out: return ret; } -static int record_deleted_ref(struct send_ctx *sctx) +static int record_deleted_ref_if_needed(int num, u64 dir, int index, + struct fs_path *name, void *ctx) { - int ret; + int ret = 0; + struct send_ctx *sctx = ctx; + struct rb_node *node = NULL; + struct recorded_ref data; + struct recorded_ref *ref; + u64 dir_gen; - ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, - sctx->cmp_key, 0, __record_deleted_ref, sctx); + ret = get_inode_gen(sctx->parent_root, dir, &dir_gen); if (ret < 0) goto out; - ret = 0; + data.dir = dir; + data.dir_gen = dir_gen; + set_ref_path(&data, name); + node = rb_find(&data, &sctx->rbtree_new_refs, rbtree_ref_comp); + if (node) { + ref = rb_entry(node, struct recorded_ref, node); + recorded_ref_free(ref); + } else { + ret = record_ref_in_tree(&sctx->rbtree_deleted_refs, + &sctx->deleted_refs, name, dir, + dir_gen, sctx); + } out: return ret; } -struct find_ref_ctx { - u64 dir; - u64 dir_gen; - struct btrfs_root *root; - struct fs_path *name; - int found_idx; -}; - -static int __find_iref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx_) -{ - struct find_ref_ctx *ctx = ctx_; - u64 dir_gen; - int ret; - - if (dir == ctx->dir && fs_path_len(name) == fs_path_len(ctx->name) && - strncmp(name->start, ctx->name->start, fs_path_len(name)) == 0) { - /* - * To avoid doing extra lookups we'll only do this if everything - * else matches. - */ - ret = get_inode_info(ctx->root, dir, NULL, &dir_gen, NULL, - NULL, NULL, NULL); - if (ret) - return ret; - if (dir_gen != ctx->dir_gen) - return 0; - ctx->found_idx = num; - return 1; - } - return 0; -} - -static int find_iref(struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *key, - u64 dir, u64 dir_gen, struct fs_path *name) +static int record_new_ref(struct send_ctx *sctx) { int ret; - struct find_ref_ctx ctx; - ctx.dir = dir; - ctx.name = name; - ctx.dir_gen = dir_gen; - ctx.found_idx = -1; - ctx.root = root; - - ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx); + ret = iterate_inode_ref(sctx->send_root, sctx->left_path, + sctx->cmp_key, 0, record_new_ref_if_needed, sctx); if (ret < 0) - return ret; - - if (ctx.found_idx == -1) - return -ENOENT; - - return ctx.found_idx; -} - -static int __record_changed_new_ref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx) -{ - u64 dir_gen; - int ret; - struct send_ctx *sctx = ctx; - - ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL, - NULL, NULL, NULL); - if (ret) - return ret; - - ret = find_iref(sctx->parent_root, sctx->right_path, - sctx->cmp_key, dir, dir_gen, name); - if (ret == -ENOENT) - ret = __record_new_ref(num, dir, index, name, sctx); - else if (ret > 0) - ret = 0; + goto out; + ret = 0; +out: return ret; } -static int __record_changed_deleted_ref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx) +static int record_deleted_ref(struct send_ctx *sctx) { - u64 dir_gen; int ret; - struct send_ctx *sctx = ctx; - - ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL, - NULL, NULL, NULL); - if (ret) - return ret; - ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key, - dir, dir_gen, name); - if (ret == -ENOENT) - ret = __record_deleted_ref(num, dir, index, name, sctx); - else if (ret > 0) - ret = 0; + ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, + sctx->cmp_key, 0, record_deleted_ref_if_needed, + sctx); + if (ret < 0) + goto out; + ret = 0; +out: return ret; } @@ -4536,11 +4537,11 @@ static int record_changed_ref(struct send_ctx *sctx) int ret = 0; ret = iterate_inode_ref(sctx->send_root, sctx->left_path, - sctx->cmp_key, 0, __record_changed_new_ref, sctx); + sctx->cmp_key, 0, record_new_ref_if_needed, sctx); if (ret < 0) goto out; ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, - sctx->cmp_key, 0, __record_changed_deleted_ref, sctx); + sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx); if (ret < 0) goto out; ret = 0; @@ -4556,13 +4557,12 @@ out: static int process_all_refs(struct send_ctx *sctx, enum btrfs_compare_tree_result cmd) { - int ret; + int ret = 0; + int iter_ret = 0; struct btrfs_root *root; struct btrfs_path *path; struct btrfs_key key; struct btrfs_key found_key; - struct extent_buffer *eb; - int slot; iterate_inode_ref_t cb; int pending_move = 0; @@ -4572,10 +4572,10 @@ static int process_all_refs(struct send_ctx *sctx, if (cmd == BTRFS_COMPARE_TREE_NEW) { root = sctx->send_root; - cb = __record_new_ref; + cb = record_new_ref_if_needed; } else if (cmd == BTRFS_COMPARE_TREE_DELETED) { root = sctx->parent_root; - cb = __record_deleted_ref; + cb = record_deleted_ref_if_needed; } else { btrfs_err(sctx->send_root->fs_info, "Wrong command %d in process_all_refs", cmd); @@ -4586,24 +4586,7 @@ static int process_all_refs(struct send_ctx *sctx, key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_INODE_REF_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - while (1) { - eb = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out; - else if (ret > 0) - break; - continue; - } - - btrfs_item_key_to_cpu(eb, &found_key, slot); - + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { if (found_key.objectid != key.objectid || (found_key.type != BTRFS_INODE_REF_KEY && found_key.type != BTRFS_INODE_EXTREF_KEY)) @@ -4612,8 +4595,11 @@ static int process_all_refs(struct send_ctx *sctx, ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); if (ret < 0) goto out; - - path->slots[0]++; + } + /* Catch error found during iteration */ + if (iter_ret < 0) { + ret = iter_ret; + goto out; } btrfs_release_path(path); @@ -4875,13 +4861,12 @@ out: static int process_all_new_xattrs(struct send_ctx *sctx) { - int ret; + int ret = 0; + int iter_ret = 0; struct btrfs_root *root; struct btrfs_path *path; struct btrfs_key key; struct btrfs_key found_key; - struct extent_buffer *eb; - int slot; path = alloc_path_for_send(); if (!path) @@ -4892,43 +4877,103 @@ static int process_all_new_xattrs(struct send_ctx *sctx) key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_XATTR_ITEM_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - while (1) { - eb = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) { - goto out; - } else if (ret > 0) { - ret = 0; - break; - } - continue; - } - - btrfs_item_key_to_cpu(eb, &found_key, slot); + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { if (found_key.objectid != key.objectid || found_key.type != key.type) { ret = 0; - goto out; + break; } ret = iterate_dir_item(root, path, __process_new_xattr, sctx); if (ret < 0) - goto out; - - path->slots[0]++; + break; } + /* Catch error found during iteration */ + if (iter_ret < 0) + ret = iter_ret; -out: btrfs_free_path(path); return ret; } +static int send_verity(struct send_ctx *sctx, struct fs_path *path, + struct fsverity_descriptor *desc) +{ + int ret; + + ret = begin_cmd(sctx, BTRFS_SEND_C_ENABLE_VERITY); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); + TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM, + le8_to_cpu(desc->hash_algorithm)); + TLV_PUT_U32(sctx, BTRFS_SEND_A_VERITY_BLOCK_SIZE, + 1U << le8_to_cpu(desc->log_blocksize)); + TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SALT_DATA, desc->salt, + le8_to_cpu(desc->salt_size)); + TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SIG_DATA, desc->signature, + le32_to_cpu(desc->sig_size)); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +static int process_verity(struct send_ctx *sctx) +{ + int ret = 0; + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + struct inode *inode; + struct fs_path *p; + + inode = btrfs_iget(fs_info->sb, sctx->cur_ino, sctx->send_root); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + ret = btrfs_get_verity_descriptor(inode, NULL, 0); + if (ret < 0) + goto iput; + + if (ret > FS_VERITY_MAX_DESCRIPTOR_SIZE) { + ret = -EMSGSIZE; + goto iput; + } + if (!sctx->verity_descriptor) { + sctx->verity_descriptor = kvmalloc(FS_VERITY_MAX_DESCRIPTOR_SIZE, + GFP_KERNEL); + if (!sctx->verity_descriptor) { + ret = -ENOMEM; + goto iput; + } + } + + ret = btrfs_get_verity_descriptor(inode, sctx->verity_descriptor, ret); + if (ret < 0) + goto iput; + + p = fs_path_alloc(); + if (!p) { + ret = -ENOMEM; + goto iput; + } + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto free_path; + + ret = send_verity(sctx, p, sctx->verity_descriptor); + if (ret < 0) + goto free_path; + +free_path: + fs_path_free(p); +iput: + iput(inode); + return ret; +} + static inline u64 max_send_read_size(const struct send_ctx *sctx) { return sctx->send_max_size - SZ_16K; @@ -4936,14 +4981,28 @@ static inline u64 max_send_read_size(const struct send_ctx *sctx) static int put_data_header(struct send_ctx *sctx, u32 len) { - struct btrfs_tlv_header *hdr; + if (WARN_ON_ONCE(sctx->put_data)) + return -EINVAL; + sctx->put_data = true; + if (sctx->proto >= 2) { + /* + * Since v2, the data attribute header doesn't include a length, + * it is implicitly to the end of the command. + */ + if (sctx->send_max_size - sctx->send_size < sizeof(__le16) + len) + return -EOVERFLOW; + put_unaligned_le16(BTRFS_SEND_A_DATA, sctx->send_buf + sctx->send_size); + sctx->send_size += sizeof(__le16); + } else { + struct btrfs_tlv_header *hdr; - if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len) - return -EOVERFLOW; - hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size); - put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type); - put_unaligned_le16(len, &hdr->tlv_len); - sctx->send_size += sizeof(*hdr); + if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len) + return -EOVERFLOW; + hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size); + put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type); + put_unaligned_le16(len, &hdr->tlv_len); + sctx->send_size += sizeof(*hdr); + } return 0; } @@ -4951,7 +5010,6 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; - struct inode *inode; struct page *page; pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; @@ -4962,43 +5020,40 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) if (ret) return ret; - inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); - if (IS_ERR(inode)) - return PTR_ERR(inode); - last_index = (offset + len - 1) >> PAGE_SHIFT; - /* initial readahead */ - memset(&sctx->ra, 0, sizeof(struct file_ra_state)); - file_ra_state_init(&sctx->ra, inode->i_mapping); - while (index <= last_index) { unsigned cur_len = min_t(unsigned, len, PAGE_SIZE - pg_offset); - page = find_lock_page(inode->i_mapping, index); + page = find_lock_page(sctx->cur_inode->i_mapping, index); if (!page) { - page_cache_sync_readahead(inode->i_mapping, &sctx->ra, - NULL, index, last_index + 1 - index); + page_cache_sync_readahead(sctx->cur_inode->i_mapping, + &sctx->ra, NULL, index, + last_index + 1 - index); - page = find_or_create_page(inode->i_mapping, index, - GFP_KERNEL); + page = find_or_create_page(sctx->cur_inode->i_mapping, + index, GFP_KERNEL); if (!page) { ret = -ENOMEM; break; } } - if (PageReadahead(page)) { - page_cache_async_readahead(inode->i_mapping, &sctx->ra, - NULL, page, index, last_index + 1 - index); - } + if (PageReadahead(page)) + page_cache_async_readahead(sctx->cur_inode->i_mapping, + &sctx->ra, NULL, page_folio(page), + index, last_index + 1 - index); if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); + btrfs_read_folio(NULL, page_folio(page)); lock_page(page); if (!PageUptodate(page)) { unlock_page(page); + btrfs_err(fs_info, + "send: IO error at offset %llu for inode %llu root %llu", + page_offset(page), sctx->cur_ino, + sctx->send_root->root_key.objectid); put_page(page); ret = -EIO; break; @@ -5014,7 +5069,7 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) len -= cur_len; sctx->send_size += cur_len; } - iput(inode); + return ret; } @@ -5089,8 +5144,7 @@ static int send_clone(struct send_ctx *sctx, TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); if (clone_root->root == sctx->send_root) { - ret = get_inode_info(sctx->send_root, clone_root->ino, NULL, - &gen, NULL, NULL, NULL, NULL); + ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen); if (ret < 0) goto out; ret = get_cur_path(sctx, clone_root->ino, gen, p); @@ -5217,16 +5271,250 @@ tlv_put_failure: return ret; } -static int send_extent_data(struct send_ctx *sctx, - const u64 offset, - const u64 len) +static int send_encoded_inline_extent(struct send_ctx *sctx, + struct btrfs_path *path, u64 offset, + u64 len) { + struct btrfs_root *root = sctx->send_root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct inode *inode; + struct fs_path *fspath; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_key key; + struct btrfs_file_extent_item *ei; + u64 ram_bytes; + size_t inline_size; + int ret; + + inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + fspath = fs_path_alloc(); + if (!fspath) { + ret = -ENOMEM; + goto out; + } + + ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); + if (ret < 0) + goto out; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + ram_bytes = btrfs_file_extent_ram_bytes(leaf, ei); + inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN, + min(key.offset + ram_bytes - offset, len)); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, ram_bytes); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, offset - key.offset); + ret = btrfs_encoded_io_compression_from_extent(fs_info, + btrfs_file_extent_compression(leaf, ei)); + if (ret < 0) + goto out; + TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret); + + ret = put_data_header(sctx, inline_size); + if (ret < 0) + goto out; + read_extent_buffer(leaf, sctx->send_buf + sctx->send_size, + btrfs_file_extent_inline_start(ei), inline_size); + sctx->send_size += inline_size; + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(fspath); + iput(inode); + return ret; +} + +static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, + u64 offset, u64 len) +{ + struct btrfs_root *root = sctx->send_root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct inode *inode; + struct fs_path *fspath; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_key key; + struct btrfs_file_extent_item *ei; + u64 disk_bytenr, disk_num_bytes; + u32 data_offset; + struct btrfs_cmd_header *hdr; + u32 crc; + int ret; + + inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + fspath = fs_path_alloc(); + if (!fspath) { + ret = -ENOMEM; + goto out; + } + + ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); + if (ret < 0) + goto out; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, ei); + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN, + min(key.offset + btrfs_file_extent_num_bytes(leaf, ei) - offset, + len)); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, + btrfs_file_extent_ram_bytes(leaf, ei)); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, + offset - key.offset + btrfs_file_extent_offset(leaf, ei)); + ret = btrfs_encoded_io_compression_from_extent(fs_info, + btrfs_file_extent_compression(leaf, ei)); + if (ret < 0) + goto out; + TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret); + TLV_PUT_U32(sctx, BTRFS_SEND_A_ENCRYPTION, 0); + + ret = put_data_header(sctx, disk_num_bytes); + if (ret < 0) + goto out; + + /* + * We want to do I/O directly into the send buffer, so get the next page + * boundary in the send buffer. This means that there may be a gap + * between the beginning of the command and the file data. + */ + data_offset = ALIGN(sctx->send_size, PAGE_SIZE); + if (data_offset > sctx->send_max_size || + sctx->send_max_size - data_offset < disk_num_bytes) { + ret = -EOVERFLOW; + goto out; + } + + /* + * Note that send_buf is a mapping of send_buf_pages, so this is really + * reading into send_buf. + */ + ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset, + disk_bytenr, disk_num_bytes, + sctx->send_buf_pages + + (data_offset >> PAGE_SHIFT)); + if (ret) + goto out; + + hdr = (struct btrfs_cmd_header *)sctx->send_buf; + hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr)); + hdr->crc = 0; + crc = btrfs_crc32c(0, sctx->send_buf, sctx->send_size); + crc = btrfs_crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes); + hdr->crc = cpu_to_le32(crc); + + ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, + &sctx->send_off); + if (!ret) { + ret = write_buf(sctx->send_filp, sctx->send_buf + data_offset, + disk_num_bytes, &sctx->send_off); + } + sctx->send_size = 0; + sctx->put_data = false; + +tlv_put_failure: +out: + fs_path_free(fspath); + iput(inode); + return ret; +} + +static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, + const u64 offset, const u64 len) +{ + const u64 end = offset + len; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_file_extent_item *ei; u64 read_size = max_send_read_size(sctx); u64 sent = 0; if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) return send_update_extent(sctx, offset, len); + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) && + btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { + bool is_inline = (btrfs_file_extent_type(leaf, ei) == + BTRFS_FILE_EXTENT_INLINE); + + /* + * Send the compressed extent unless the compressed data is + * larger than the decompressed data. This can happen if we're + * not sending the entire extent, either because it has been + * partially overwritten/truncated or because this is a part of + * the extent that we couldn't clone in clone_range(). + */ + if (is_inline && + btrfs_file_extent_inline_item_len(leaf, + path->slots[0]) <= len) { + return send_encoded_inline_extent(sctx, path, offset, + len); + } else if (!is_inline && + btrfs_file_extent_disk_num_bytes(leaf, ei) <= len) { + return send_encoded_extent(sctx, path, offset, len); + } + } + + if (sctx->cur_inode == NULL) { + struct btrfs_root *root = sctx->send_root; + + sctx->cur_inode = btrfs_iget(root->fs_info->sb, sctx->cur_ino, root); + if (IS_ERR(sctx->cur_inode)) { + int err = PTR_ERR(sctx->cur_inode); + + sctx->cur_inode = NULL; + return err; + } + memset(&sctx->ra, 0, sizeof(struct file_ra_state)); + file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping); + + /* + * It's very likely there are no pages from this inode in the page + * cache, so after reading extents and sending their data, we clean + * the page cache to avoid trashing the page cache (adding pressure + * to the page cache and forcing eviction of other data more useful + * for applications). + * + * We decide if we should clean the page cache simply by checking + * if the inode's mapping nrpages is 0 when we first open it, and + * not by using something like filemap_range_has_page() before + * reading an extent because when we ask the readahead code to + * read a given file range, it may (and almost always does) read + * pages from beyond that range (see the documentation for + * page_cache_sync_readahead()), so it would not be reliable, + * because after reading the first extent future calls to + * filemap_range_has_page() would return true because the readahead + * on the previous extent resulted in reading pages of the current + * extent as well. + */ + sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == 0); + sctx->page_cache_clear_start = round_down(offset, PAGE_SIZE); + } + while (sent < len) { u64 size = min(len - sent, read_size); int ret; @@ -5236,6 +5524,37 @@ static int send_extent_data(struct send_ctx *sctx, return ret; sent += size; } + + if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) { + /* + * Always operate only on ranges that are a multiple of the page + * size. This is not only to prevent zeroing parts of a page in + * the case of subpage sector size, but also to guarantee we evict + * pages, as passing a range that is smaller than page size does + * not evict the respective page (only zeroes part of its content). + * + * Always start from the end offset of the last range cleared. + * This is because the readahead code may (and very often does) + * reads pages beyond the range we request for readahead. So if + * we have an extent layout like this: + * + * [ extent A ] [ extent B ] [ extent C ] + * + * When we ask page_cache_sync_readahead() to read extent A, it + * may also trigger reads for pages of extent B. If we are doing + * an incremental send and extent B has not changed between the + * parent and send snapshots, some or all of its pages may end + * up being read and placed in the page cache. So when truncating + * the page cache we always start from the end offset of the + * previously processed extent up to the end of the current + * extent. + */ + truncate_inode_pages_range(&sctx->cur_inode->i_data, + sctx->page_cache_clear_start, + end - 1); + sctx->page_cache_clear_start = end; + } + return 0; } @@ -5297,16 +5616,14 @@ out: return ret; } -static int clone_range(struct send_ctx *sctx, - struct clone_root *clone_root, - const u64 disk_byte, - u64 data_offset, - u64 offset, - u64 len) +static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, + struct clone_root *clone_root, const u64 disk_byte, + u64 data_offset, u64 offset, u64 len) { struct btrfs_path *path; struct btrfs_key key; int ret; + struct btrfs_inode_info info; u64 clone_src_i_size = 0; /* @@ -5326,7 +5643,7 @@ static int clone_range(struct send_ctx *sctx, */ if (clone_root->offset == 0 && len == sctx->send_root->fs_info->sectorsize) - return send_extent_data(sctx, offset, len); + return send_extent_data(sctx, dst_path, offset, len); path = alloc_path_for_send(); if (!path) @@ -5336,11 +5653,11 @@ static int clone_range(struct send_ctx *sctx, * There are inodes that have extents that lie behind its i_size. Don't * accept clones from these extents. */ - ret = __get_inode_info(clone_root->root, path, clone_root->ino, - &clone_src_i_size, NULL, NULL, NULL, NULL, NULL); + ret = get_inode_info(clone_root->root, clone_root->ino, &info); btrfs_release_path(path); if (ret < 0) goto out; + clone_src_i_size = info.size; /* * We can't send a clone operation for the entire range if we find @@ -5385,6 +5702,7 @@ static int clone_range(struct send_ctx *sctx, u64 ext_len; u64 clone_len; u64 clone_data_offset; + bool crossed_src_i_size = false; if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(clone_root->root, path); @@ -5423,7 +5741,8 @@ static int clone_range(struct send_ctx *sctx, if (hole_len > len) hole_len = len; - ret = send_extent_data(sctx, offset, hole_len); + ret = send_extent_data(sctx, dst_path, offset, + hole_len); if (ret < 0) goto out; @@ -5441,8 +5760,10 @@ static int clone_range(struct send_ctx *sctx, if (key.offset >= clone_src_i_size) break; - if (key.offset + ext_len > clone_src_i_size) + if (key.offset + ext_len > clone_src_i_size) { ext_len = clone_src_i_size - key.offset; + crossed_src_i_size = true; + } clone_data_offset = btrfs_file_extent_offset(leaf, ei); if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) { @@ -5496,14 +5817,35 @@ static int clone_range(struct send_ctx *sctx, if (ret < 0) goto out; } - ret = send_extent_data(sctx, offset + slen, + ret = send_extent_data(sctx, dst_path, + offset + slen, clone_len - slen); } else { ret = send_clone(sctx, offset, clone_len, clone_root); } + } else if (crossed_src_i_size && clone_len < len) { + /* + * If we are at i_size of the clone source inode and we + * can not clone from it, terminate the loop. This is + * to avoid sending two write operations, one with a + * length matching clone_len and the final one after + * this loop with a length of len - clone_len. + * + * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED + * was passed to the send ioctl), this helps avoid + * sending an encoded write for an offset that is not + * sector size aligned, in case the i_size of the source + * inode is not sector size aligned. That will make the + * receiver fallback to decompression of the data and + * writing it using regular buffered IO, therefore while + * not incorrect, it's not optimal due decompression and + * possible re-compression at the receiver. + */ + break; } else { - ret = send_extent_data(sctx, offset, clone_len); + ret = send_extent_data(sctx, dst_path, offset, + clone_len); } if (ret < 0) @@ -5535,7 +5877,7 @@ next: } if (len > 0) - ret = send_extent_data(sctx, offset, len); + ret = send_extent_data(sctx, dst_path, offset, len); else ret = 0; out: @@ -5566,10 +5908,10 @@ static int send_write_or_clone(struct send_ctx *sctx, struct btrfs_file_extent_item); disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); data_offset = btrfs_file_extent_offset(path->nodes[0], ei); - ret = clone_range(sctx, clone_root, disk_byte, data_offset, - offset, end - offset); + ret = clone_range(sctx, path, clone_root, disk_byte, + data_offset, offset, end - offset); } else { - ret = send_extent_data(sctx, offset, end - offset); + ret = send_extent_data(sctx, path, offset, end - offset); } sctx->cur_inode_next_write_offset = end; return ret; @@ -5966,13 +6308,12 @@ out: static int process_all_extents(struct send_ctx *sctx) { - int ret; + int ret = 0; + int iter_ret = 0; struct btrfs_root *root; struct btrfs_path *path; struct btrfs_key key; struct btrfs_key found_key; - struct extent_buffer *eb; - int slot; root = sctx->send_root; path = alloc_path_for_send(); @@ -5982,41 +6323,21 @@ static int process_all_extents(struct send_ctx *sctx) key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - while (1) { - eb = path->nodes[0]; - slot = path->slots[0]; - - if (slot >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) { - goto out; - } else if (ret > 0) { - ret = 0; - break; - } - continue; - } - - btrfs_item_key_to_cpu(eb, &found_key, slot); - + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { if (found_key.objectid != key.objectid || found_key.type != key.type) { ret = 0; - goto out; + break; } ret = process_extent(sctx, path, &found_key); if (ret < 0) - goto out; - - path->slots[0]++; + break; } + /* Catch error found during iteration */ + if (iter_ret < 0) + ret = iter_ret; -out: btrfs_free_path(path); return ret; } @@ -6047,14 +6368,18 @@ out: static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) { int ret = 0; + struct btrfs_inode_info info; u64 left_mode; u64 left_uid; u64 left_gid; + u64 left_fileattr; u64 right_mode; u64 right_uid; u64 right_gid; + u64 right_fileattr; int need_chmod = 0; int need_chown = 0; + bool need_fileattr = false; int need_truncate = 1; int pending_move = 0; int refs_processed = 0; @@ -6086,11 +6411,13 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) goto out; if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino) goto out; - - ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL, - &left_mode, &left_uid, &left_gid, NULL); + ret = get_inode_info(sctx->send_root, sctx->cur_ino, &info); if (ret < 0) goto out; + left_mode = info.mode; + left_uid = info.uid; + left_gid = info.gid; + left_fileattr = info.fileattr; if (!sctx->parent_root || sctx->cur_inode_new) { need_chown = 1; @@ -6101,16 +6428,21 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) } else { u64 old_size; - ret = get_inode_info(sctx->parent_root, sctx->cur_ino, - &old_size, NULL, &right_mode, &right_uid, - &right_gid, NULL); + ret = get_inode_info(sctx->parent_root, sctx->cur_ino, &info); if (ret < 0) goto out; + old_size = info.size; + right_mode = info.mode; + right_uid = info.uid; + right_gid = info.gid; + right_fileattr = info.fileattr; if (left_uid != right_uid || left_gid != right_gid) need_chown = 1; if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode) need_chmod = 1; + if (!S_ISLNK(sctx->cur_inode_mode) && left_fileattr != right_fileattr) + need_fileattr = true; if ((old_size == sctx->cur_inode_size) || (sctx->cur_inode_size > old_size && sctx->cur_inode_next_write_offset == sctx->cur_inode_size)) @@ -6154,6 +6486,19 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) if (ret < 0) goto out; } + if (need_fileattr) { + ret = send_fileattr(sctx, sctx->cur_ino, sctx->cur_inode_gen, + left_fileattr); + if (ret < 0) + goto out; + } + + if (proto_cmd_ok(sctx, BTRFS_SEND_C_ENABLE_VERITY) + && sctx->cur_inode_needs_verity) { + ret = process_verity(sctx); + if (ret < 0) + goto out; + } ret = send_capabilities(sctx); if (ret < 0) @@ -6184,91 +6529,28 @@ out: return ret; } -struct parent_paths_ctx { - struct list_head *refs; - struct send_ctx *sctx; -}; - -static int record_parent_ref(int num, u64 dir, int index, struct fs_path *name, - void *ctx) -{ - struct parent_paths_ctx *ppctx = ctx; - - return record_ref(ppctx->sctx->parent_root, dir, name, ppctx->sctx, - ppctx->refs); -} - -/* - * Issue unlink operations for all paths of the current inode found in the - * parent snapshot. - */ -static int btrfs_unlink_all_paths(struct send_ctx *sctx) +static void close_current_inode(struct send_ctx *sctx) { - LIST_HEAD(deleted_refs); - struct btrfs_path *path; - struct btrfs_key key; - struct parent_paths_ctx ctx; - int ret; - - path = alloc_path_for_send(); - if (!path) - return -ENOMEM; - - key.objectid = sctx->cur_ino; - key.type = BTRFS_INODE_REF_KEY; - key.offset = 0; - ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0); - if (ret < 0) - goto out; - - ctx.refs = &deleted_refs; - ctx.sctx = sctx; + u64 i_size; - while (true) { - struct extent_buffer *eb = path->nodes[0]; - int slot = path->slots[0]; - - if (slot >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(sctx->parent_root, path); - if (ret < 0) - goto out; - else if (ret > 0) - break; - continue; - } - - btrfs_item_key_to_cpu(eb, &key, slot); - if (key.objectid != sctx->cur_ino) - break; - if (key.type != BTRFS_INODE_REF_KEY && - key.type != BTRFS_INODE_EXTREF_KEY) - break; - - ret = iterate_inode_ref(sctx->parent_root, path, &key, 1, - record_parent_ref, &ctx); - if (ret < 0) - goto out; + if (sctx->cur_inode == NULL) + return; - path->slots[0]++; - } + i_size = i_size_read(sctx->cur_inode); - while (!list_empty(&deleted_refs)) { - struct recorded_ref *ref; + /* + * If we are doing an incremental send, we may have extents between the + * last processed extent and the i_size that have not been processed + * because they haven't changed but we may have read some of their pages + * through readahead, see the comments at send_extent_data(). + */ + if (sctx->clean_page_cache && sctx->page_cache_clear_start < i_size) + truncate_inode_pages_range(&sctx->cur_inode->i_data, + sctx->page_cache_clear_start, + round_up(i_size, PAGE_SIZE) - 1); - ref = list_first_entry(&deleted_refs, struct recorded_ref, list); - ret = send_unlink(sctx, ref->full_path); - if (ret < 0) - goto out; - fs_path_free(ref->full_path); - list_del(&ref->list); - kfree(ref); - } - ret = 0; -out: - btrfs_free_path(path); - if (ret) - __free_recorded_refs(&deleted_refs); - return ret; + iput(sctx->cur_inode); + sctx->cur_inode = NULL; } static int changed_inode(struct send_ctx *sctx, @@ -6281,8 +6563,10 @@ static int changed_inode(struct send_ctx *sctx, u64 left_gen = 0; u64 right_gen = 0; + close_current_inode(sctx); + sctx->cur_ino = key->objectid; - sctx->cur_inode_new_gen = 0; + sctx->cur_inode_new_gen = false; sctx->cur_inode_last_extent = (u64)-1; sctx->cur_inode_next_write_offset = 0; sctx->ignore_cur_inode = false; @@ -6323,7 +6607,7 @@ static int changed_inode(struct send_ctx *sctx, */ if (left_gen != right_gen && sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) - sctx->cur_inode_new_gen = 1; + sctx->cur_inode_new_gen = true; } /* @@ -6335,28 +6619,39 @@ static int changed_inode(struct send_ctx *sctx, * file descriptor against it or turning a RO snapshot into RW mode, * keep an open file descriptor against a file, delete it and then * turn the snapshot back to RO mode before using it for a send - * operation. So if we find such cases, ignore the inode and all its - * items completely if it's a new inode, or if it's a changed inode - * make sure all its previous paths (from the parent snapshot) are all - * unlinked and all other the inode items are ignored. + * operation. The former is what the receiver operation does. + * Therefore, if we want to send these snapshots soon after they're + * received, we need to handle orphan inodes as well. Moreover, orphans + * can appear not only in the send snapshot but also in the parent + * snapshot. Here are several cases: + * + * Case 1: BTRFS_COMPARE_TREE_NEW + * | send snapshot | action + * -------------------------------- + * nlink | 0 | ignore + * + * Case 2: BTRFS_COMPARE_TREE_DELETED + * | parent snapshot | action + * ---------------------------------- + * nlink | 0 | as usual + * Note: No unlinks will be sent because there're no paths for it. + * + * Case 3: BTRFS_COMPARE_TREE_CHANGED + * | | parent snapshot | send snapshot | action + * ----------------------------------------------------------------------- + * subcase 1 | nlink | 0 | 0 | ignore + * subcase 2 | nlink | >0 | 0 | new_gen(deletion) + * subcase 3 | nlink | 0 | >0 | new_gen(creation) + * */ - if (result == BTRFS_COMPARE_TREE_NEW || - result == BTRFS_COMPARE_TREE_CHANGED) { - u32 nlinks; - - nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii); - if (nlinks == 0) { + if (result == BTRFS_COMPARE_TREE_NEW) { + if (btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii) == 0) { sctx->ignore_cur_inode = true; - if (result == BTRFS_COMPARE_TREE_CHANGED) - ret = btrfs_unlink_all_paths(sctx); goto out; } - } - - if (result == BTRFS_COMPARE_TREE_NEW) { sctx->cur_inode_gen = left_gen; - sctx->cur_inode_new = 1; - sctx->cur_inode_deleted = 0; + sctx->cur_inode_new = true; + sctx->cur_inode_deleted = false; sctx->cur_inode_size = btrfs_inode_size( sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( @@ -6367,13 +6662,23 @@ static int changed_inode(struct send_ctx *sctx, ret = send_create_inode_if_needed(sctx); } else if (result == BTRFS_COMPARE_TREE_DELETED) { sctx->cur_inode_gen = right_gen; - sctx->cur_inode_new = 0; - sctx->cur_inode_deleted = 1; + sctx->cur_inode_new = false; + sctx->cur_inode_deleted = true; sctx->cur_inode_size = btrfs_inode_size( sctx->right_path->nodes[0], right_ii); sctx->cur_inode_mode = btrfs_inode_mode( sctx->right_path->nodes[0], right_ii); } else if (result == BTRFS_COMPARE_TREE_CHANGED) { + u32 new_nlinks, old_nlinks; + + new_nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii); + old_nlinks = btrfs_inode_nlink(sctx->right_path->nodes[0], right_ii); + if (new_nlinks == 0 && old_nlinks == 0) { + sctx->ignore_cur_inode = true; + goto out; + } else if (new_nlinks == 0 || old_nlinks == 0) { + sctx->cur_inode_new_gen = 1; + } /* * We need to do some special handling in case the inode was * reported as changed with a changed generation number. This @@ -6385,58 +6690,66 @@ static int changed_inode(struct send_ctx *sctx, /* * First, process the inode as if it was deleted. */ - sctx->cur_inode_gen = right_gen; - sctx->cur_inode_new = 0; - sctx->cur_inode_deleted = 1; - sctx->cur_inode_size = btrfs_inode_size( - sctx->right_path->nodes[0], right_ii); - sctx->cur_inode_mode = btrfs_inode_mode( - sctx->right_path->nodes[0], right_ii); - ret = process_all_refs(sctx, - BTRFS_COMPARE_TREE_DELETED); - if (ret < 0) - goto out; + if (old_nlinks > 0) { + sctx->cur_inode_gen = right_gen; + sctx->cur_inode_new = false; + sctx->cur_inode_deleted = true; + sctx->cur_inode_size = btrfs_inode_size( + sctx->right_path->nodes[0], right_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->right_path->nodes[0], right_ii); + ret = process_all_refs(sctx, + BTRFS_COMPARE_TREE_DELETED); + if (ret < 0) + goto out; + } /* * Now process the inode as if it was new. */ - sctx->cur_inode_gen = left_gen; - sctx->cur_inode_new = 1; - sctx->cur_inode_deleted = 0; - sctx->cur_inode_size = btrfs_inode_size( - sctx->left_path->nodes[0], left_ii); - sctx->cur_inode_mode = btrfs_inode_mode( - sctx->left_path->nodes[0], left_ii); - sctx->cur_inode_rdev = btrfs_inode_rdev( - sctx->left_path->nodes[0], left_ii); - ret = send_create_inode_if_needed(sctx); - if (ret < 0) - goto out; + if (new_nlinks > 0) { + sctx->cur_inode_gen = left_gen; + sctx->cur_inode_new = true; + sctx->cur_inode_deleted = false; + sctx->cur_inode_size = btrfs_inode_size( + sctx->left_path->nodes[0], + left_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->left_path->nodes[0], + left_ii); + sctx->cur_inode_rdev = btrfs_inode_rdev( + sctx->left_path->nodes[0], + left_ii); + ret = send_create_inode_if_needed(sctx); + if (ret < 0) + goto out; - ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW); - if (ret < 0) - goto out; - /* - * Advance send_progress now as we did not get into - * process_recorded_refs_if_needed in the new_gen case. - */ - sctx->send_progress = sctx->cur_ino + 1; + ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW); + if (ret < 0) + goto out; + /* + * Advance send_progress now as we did not get + * into process_recorded_refs_if_needed in the + * new_gen case. + */ + sctx->send_progress = sctx->cur_ino + 1; - /* - * Now process all extents and xattrs of the inode as if - * they were all new. - */ - ret = process_all_extents(sctx); - if (ret < 0) - goto out; - ret = process_all_new_xattrs(sctx); - if (ret < 0) - goto out; + /* + * Now process all extents and xattrs of the + * inode as if they were all new. + */ + ret = process_all_extents(sctx); + if (ret < 0) + goto out; + ret = process_all_new_xattrs(sctx); + if (ret < 0) + goto out; + } } else { sctx->cur_inode_gen = left_gen; - sctx->cur_inode_new = 0; - sctx->cur_inode_new_gen = 0; - sctx->cur_inode_deleted = 0; + sctx->cur_inode_new = false; + sctx->cur_inode_new_gen = false; + sctx->cur_inode_deleted = false; sctx->cur_inode_size = btrfs_inode_size( sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( @@ -6543,18 +6856,27 @@ static int changed_extent(struct send_ctx *sctx, return ret; } +static int changed_verity(struct send_ctx *sctx, enum btrfs_compare_tree_result result) +{ + int ret = 0; + + if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { + if (result == BTRFS_COMPARE_TREE_NEW) + sctx->cur_inode_needs_verity = true; + } + return ret; +} + static int dir_changed(struct send_ctx *sctx, u64 dir) { u64 orig_gen, new_gen; int ret; - ret = get_inode_info(sctx->send_root, dir, NULL, &new_gen, NULL, NULL, - NULL, NULL); + ret = get_inode_gen(sctx->send_root, dir, &new_gen); if (ret) return ret; - ret = get_inode_info(sctx->parent_root, dir, NULL, &orig_gen, NULL, - NULL, NULL, NULL); + ret = get_inode_gen(sctx->parent_root, dir, &orig_gen); if (ret) return ret; @@ -6697,6 +7019,9 @@ static int changed_cb(struct btrfs_path *left_path, ret = changed_xattr(sctx, result); else if (key->type == BTRFS_EXTENT_DATA_KEY) ret = changed_extent(sctx, result); + else if (key->type == BTRFS_VERITY_DESC_ITEM_KEY && + key->offset == 0) + ret = changed_verity(sctx, result); } out: @@ -7473,10 +7798,10 @@ static void dedupe_in_progress_warn(const struct btrfs_root *root) root->root_key.objectid, root->dedupe_in_progress); } -long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) +long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) { int ret = 0; - struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root; + struct btrfs_root *send_root = BTRFS_I(inode)->root; struct btrfs_fs_info *fs_info = send_root->fs_info; struct btrfs_root *clone_root; struct send_ctx *sctx = NULL; @@ -7550,6 +7875,10 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) } else { sctx->proto = 1; } + if ((arg->flags & BTRFS_SEND_FLAG_COMPRESSED) && sctx->proto < 2) { + ret = -EINVAL; + goto out; + } sctx->send_filp = fget(arg->send_fd); if (!sctx->send_filp) { @@ -7569,8 +7898,31 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) sctx->clone_roots_cnt = arg->clone_sources_count; - sctx->send_max_size = BTRFS_SEND_BUF_SIZE; - sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL); + if (sctx->proto >= 2) { + u32 send_buf_num_pages; + + sctx->send_max_size = ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE); + sctx->send_buf = vmalloc(sctx->send_max_size); + if (!sctx->send_buf) { + ret = -ENOMEM; + goto out; + } + send_buf_num_pages = sctx->send_max_size >> PAGE_SHIFT; + sctx->send_buf_pages = kcalloc(send_buf_num_pages, + sizeof(*sctx->send_buf_pages), + GFP_KERNEL); + if (!sctx->send_buf_pages) { + ret = -ENOMEM; + goto out; + } + for (i = 0; i < send_buf_num_pages; i++) { + sctx->send_buf_pages[i] = + vmalloc_to_page(sctx->send_buf + (i << PAGE_SHIFT)); + } + } else { + sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1; + sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL); + } if (!sctx->send_buf) { ret = -ENOMEM; goto out; @@ -7579,6 +7931,8 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) sctx->pending_dir_moves = RB_ROOT; sctx->waiting_dir_moves = RB_ROOT; sctx->orphan_dirs = RB_ROOT; + sctx->rbtree_new_refs = RB_ROOT; + sctx->rbtree_deleted_refs = RB_ROOT; sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), arg->clone_sources_count + 1, @@ -7763,10 +8117,14 @@ out: fput(sctx->send_filp); kvfree(sctx->clone_roots); + kfree(sctx->send_buf_pages); kvfree(sctx->send_buf); + kvfree(sctx->verity_descriptor); name_cache_free(sctx); + close_current_inode(sctx); + kfree(sctx); } diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 23bcefc84e49..f7585cfa7e52 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -7,12 +7,24 @@ #ifndef BTRFS_SEND_H #define BTRFS_SEND_H -#include "ctree.h" +#include <linux/types.h> #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" -#define BTRFS_SEND_STREAM_VERSION 1 +/* Conditional support for the upcoming protocol version. */ +#ifdef CONFIG_BTRFS_DEBUG +#define BTRFS_SEND_STREAM_VERSION 3 +#else +#define BTRFS_SEND_STREAM_VERSION 2 +#endif -#define BTRFS_SEND_BUF_SIZE SZ_64K +/* + * In send stream v1, no command is larger than 64K. In send stream v2, no limit + * should be assumed. + */ +#define BTRFS_SEND_BUF_SIZE_V1 SZ_64K + +struct inode; +struct btrfs_ioctl_send_args; enum btrfs_tlv_type { BTRFS_TLV_U8, @@ -46,87 +58,126 @@ struct btrfs_tlv_header { /* commands */ enum btrfs_send_cmd { - BTRFS_SEND_C_UNSPEC, + BTRFS_SEND_C_UNSPEC = 0, /* Version 1 */ - BTRFS_SEND_C_SUBVOL, - BTRFS_SEND_C_SNAPSHOT, + BTRFS_SEND_C_SUBVOL = 1, + BTRFS_SEND_C_SNAPSHOT = 2, - BTRFS_SEND_C_MKFILE, - BTRFS_SEND_C_MKDIR, - BTRFS_SEND_C_MKNOD, - BTRFS_SEND_C_MKFIFO, - BTRFS_SEND_C_MKSOCK, - BTRFS_SEND_C_SYMLINK, + BTRFS_SEND_C_MKFILE = 3, + BTRFS_SEND_C_MKDIR = 4, + BTRFS_SEND_C_MKNOD = 5, + BTRFS_SEND_C_MKFIFO = 6, + BTRFS_SEND_C_MKSOCK = 7, + BTRFS_SEND_C_SYMLINK = 8, - BTRFS_SEND_C_RENAME, - BTRFS_SEND_C_LINK, - BTRFS_SEND_C_UNLINK, - BTRFS_SEND_C_RMDIR, + BTRFS_SEND_C_RENAME = 9, + BTRFS_SEND_C_LINK = 10, + BTRFS_SEND_C_UNLINK = 11, + BTRFS_SEND_C_RMDIR = 12, - BTRFS_SEND_C_SET_XATTR, - BTRFS_SEND_C_REMOVE_XATTR, + BTRFS_SEND_C_SET_XATTR = 13, + BTRFS_SEND_C_REMOVE_XATTR = 14, - BTRFS_SEND_C_WRITE, - BTRFS_SEND_C_CLONE, + BTRFS_SEND_C_WRITE = 15, + BTRFS_SEND_C_CLONE = 16, - BTRFS_SEND_C_TRUNCATE, - BTRFS_SEND_C_CHMOD, - BTRFS_SEND_C_CHOWN, - BTRFS_SEND_C_UTIMES, + BTRFS_SEND_C_TRUNCATE = 17, + BTRFS_SEND_C_CHMOD = 18, + BTRFS_SEND_C_CHOWN = 19, + BTRFS_SEND_C_UTIMES = 20, - BTRFS_SEND_C_END, - BTRFS_SEND_C_UPDATE_EXTENT, - __BTRFS_SEND_C_MAX_V1, + BTRFS_SEND_C_END = 21, + BTRFS_SEND_C_UPDATE_EXTENT = 22, + BTRFS_SEND_C_MAX_V1 = 22, /* Version 2 */ - __BTRFS_SEND_C_MAX_V2, - + BTRFS_SEND_C_FALLOCATE = 23, + BTRFS_SEND_C_FILEATTR = 24, + BTRFS_SEND_C_ENCODED_WRITE = 25, + BTRFS_SEND_C_MAX_V2 = 25, + + /* Version 3 */ + BTRFS_SEND_C_ENABLE_VERITY = 26, + BTRFS_SEND_C_MAX_V3 = 26, /* End */ - __BTRFS_SEND_C_MAX, + BTRFS_SEND_C_MAX = 26, }; -#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) /* attributes in send stream */ enum { - BTRFS_SEND_A_UNSPEC, - - BTRFS_SEND_A_UUID, - BTRFS_SEND_A_CTRANSID, - - BTRFS_SEND_A_INO, - BTRFS_SEND_A_SIZE, - BTRFS_SEND_A_MODE, - BTRFS_SEND_A_UID, - BTRFS_SEND_A_GID, - BTRFS_SEND_A_RDEV, - BTRFS_SEND_A_CTIME, - BTRFS_SEND_A_MTIME, - BTRFS_SEND_A_ATIME, - BTRFS_SEND_A_OTIME, - - BTRFS_SEND_A_XATTR_NAME, - BTRFS_SEND_A_XATTR_DATA, - - BTRFS_SEND_A_PATH, - BTRFS_SEND_A_PATH_TO, - BTRFS_SEND_A_PATH_LINK, - - BTRFS_SEND_A_FILE_OFFSET, - BTRFS_SEND_A_DATA, - - BTRFS_SEND_A_CLONE_UUID, - BTRFS_SEND_A_CLONE_CTRANSID, - BTRFS_SEND_A_CLONE_PATH, - BTRFS_SEND_A_CLONE_OFFSET, - BTRFS_SEND_A_CLONE_LEN, - - __BTRFS_SEND_A_MAX, + BTRFS_SEND_A_UNSPEC = 0, + + /* Version 1 */ + BTRFS_SEND_A_UUID = 1, + BTRFS_SEND_A_CTRANSID = 2, + + BTRFS_SEND_A_INO = 3, + BTRFS_SEND_A_SIZE = 4, + BTRFS_SEND_A_MODE = 5, + BTRFS_SEND_A_UID = 6, + BTRFS_SEND_A_GID = 7, + BTRFS_SEND_A_RDEV = 8, + BTRFS_SEND_A_CTIME = 9, + BTRFS_SEND_A_MTIME = 10, + BTRFS_SEND_A_ATIME = 11, + BTRFS_SEND_A_OTIME = 12, + + BTRFS_SEND_A_XATTR_NAME = 13, + BTRFS_SEND_A_XATTR_DATA = 14, + + BTRFS_SEND_A_PATH = 15, + BTRFS_SEND_A_PATH_TO = 16, + BTRFS_SEND_A_PATH_LINK = 17, + + BTRFS_SEND_A_FILE_OFFSET = 18, + /* + * As of send stream v2, this attribute is special: it must be the last + * attribute in a command, its header contains only the type, and its + * length is implicitly the remaining length of the command. + */ + BTRFS_SEND_A_DATA = 19, + + BTRFS_SEND_A_CLONE_UUID = 20, + BTRFS_SEND_A_CLONE_CTRANSID = 21, + BTRFS_SEND_A_CLONE_PATH = 22, + BTRFS_SEND_A_CLONE_OFFSET = 23, + BTRFS_SEND_A_CLONE_LEN = 24, + + BTRFS_SEND_A_MAX_V1 = 24, + + /* Version 2 */ + BTRFS_SEND_A_FALLOCATE_MODE = 25, + + /* + * File attributes from the FS_*_FL namespace (i_flags, xflags), + * translated to BTRFS_INODE_* bits (BTRFS_INODE_FLAG_MASK) and stored + * in btrfs_inode_item::flags (represented by btrfs_inode::flags and + * btrfs_inode::ro_flags). + */ + BTRFS_SEND_A_FILEATTR = 26, + + BTRFS_SEND_A_UNENCODED_FILE_LEN = 27, + BTRFS_SEND_A_UNENCODED_LEN = 28, + BTRFS_SEND_A_UNENCODED_OFFSET = 29, + /* + * COMPRESSION and ENCRYPTION default to NONE (0) if omitted from + * BTRFS_SEND_C_ENCODED_WRITE. + */ + BTRFS_SEND_A_COMPRESSION = 30, + BTRFS_SEND_A_ENCRYPTION = 31, + BTRFS_SEND_A_MAX_V2 = 31, + + /* Version 3 */ + BTRFS_SEND_A_VERITY_ALGORITHM = 32, + BTRFS_SEND_A_VERITY_BLOCK_SIZE = 33, + BTRFS_SEND_A_VERITY_SALT_DATA = 34, + BTRFS_SEND_A_VERITY_SIG_DATA = 35, + BTRFS_SEND_A_MAX_V3 = 35, + + __BTRFS_SEND_A_MAX = 35, }; -#define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1) -#ifdef __KERNEL__ -long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg); -#endif +long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg); #endif diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 294242c194d8..f171bf875633 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -9,6 +9,7 @@ #include "ordered-data.h" #include "transaction.h" #include "block-group.h" +#include "zoned.h" /* * HOW DOES SPACE RESERVATION WORK @@ -181,6 +182,43 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) found->full = 0; } +/* + * Block groups with more than this value (percents) of unusable space will be + * scheduled for background reclaim. + */ +#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75) + +/* + * Calculate chunk size depending on volume type (regular or zoned). + */ +static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags) +{ + if (btrfs_is_zoned(fs_info)) + return fs_info->zone_size; + + ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK); + + if (flags & BTRFS_BLOCK_GROUP_DATA) + return BTRFS_MAX_DATA_CHUNK_SIZE; + else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + return SZ_32M; + + /* Handle BTRFS_BLOCK_GROUP_METADATA */ + if (fs_info->fs_devices->total_rw_bytes > 50ULL * SZ_1G) + return SZ_1G; + + return SZ_256M; +} + +/* + * Update default chunk size. + */ +void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, + u64 chunk_size) +{ + WRITE_ONCE(space_info->chunk_size, chunk_size); +} + static int create_space_info(struct btrfs_fs_info *info, u64 flags) { @@ -202,6 +240,10 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) INIT_LIST_HEAD(&space_info->tickets); INIT_LIST_HEAD(&space_info->priority_tickets); space_info->clamp = 1; + btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags)); + + if (btrfs_is_zoned(info)) + space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH; ret = btrfs_sysfs_add_space_info_type(info, space_info); if (ret) @@ -251,30 +293,36 @@ out: return ret; } -void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, - u64 total_bytes, u64 bytes_used, - u64 bytes_readonly, u64 bytes_zone_unusable, - struct btrfs_space_info **space_info) +void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, + struct btrfs_block_group *block_group) { struct btrfs_space_info *found; - int factor; + int factor, index; - factor = btrfs_bg_type_to_factor(flags); + factor = btrfs_bg_type_to_factor(block_group->flags); - found = btrfs_find_space_info(info, flags); + found = btrfs_find_space_info(info, block_group->flags); ASSERT(found); spin_lock(&found->lock); - found->total_bytes += total_bytes; - found->disk_total += total_bytes * factor; - found->bytes_used += bytes_used; - found->disk_used += bytes_used * factor; - found->bytes_readonly += bytes_readonly; - found->bytes_zone_unusable += bytes_zone_unusable; - if (total_bytes > 0) + found->total_bytes += block_group->length; + if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) + found->active_total_bytes += block_group->length; + found->disk_total += block_group->length * factor; + found->bytes_used += block_group->used; + found->disk_used += block_group->used * factor; + found->bytes_readonly += block_group->bytes_super; + found->bytes_zone_unusable += block_group->zone_unusable; + if (block_group->length > 0) found->full = 0; btrfs_try_granting_tickets(info, found); spin_unlock(&found->lock); - *space_info = found; + + block_group->space_info = found; + + index = btrfs_bg_flags_to_raid_index(block_group->flags); + down_write(&found->groups_sem); + list_add_tail(&block_group->list, &found->block_groups[index]); + up_write(&found->groups_sem); } struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, @@ -328,6 +376,22 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, return avail; } +static inline u64 writable_total_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) +{ + /* + * On regular filesystem, all total_bytes are always writable. On zoned + * filesystem, there may be a limitation imposed by max_active_zones. + * For metadata allocation, we cannot finish an existing active block + * group to avoid a deadlock. Thus, we need to consider only the active + * groups to be writable for metadata space. + */ + if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA)) + return space_info->total_bytes; + + return space_info->active_total_bytes; +} + int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) @@ -340,9 +404,12 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, return 0; used = btrfs_space_info_used(space_info, true); - avail = calc_available_free_space(fs_info, space_info, flush); + if (btrfs_is_zoned(fs_info) && (space_info->flags & BTRFS_BLOCK_GROUP_METADATA)) + avail = 0; + else + avail = calc_available_free_space(fs_info, space_info, flush); - if (used + bytes < space_info->total_bytes + avail) + if (used + bytes < writable_total_bytes(fs_info, space_info) + avail) return 1; return 0; } @@ -378,7 +445,7 @@ again: ticket = list_first_entry(head, struct reserve_ticket, list); /* Check and see if our ticket can be satisfied now. */ - if ((used + ticket->bytes <= space_info->total_bytes) || + if ((used + ticket->bytes <= writable_total_bytes(fs_info, space_info)) || btrfs_can_overcommit(fs_info, space_info, ticket->bytes, flush)) { btrfs_space_info_update_bytes_may_use(fs_info, @@ -409,28 +476,47 @@ do { \ spin_unlock(&__rsv->lock); \ } while (0) +static const char *space_info_flag_to_str(const struct btrfs_space_info *space_info) +{ + switch (space_info->flags) { + case BTRFS_BLOCK_GROUP_SYSTEM: + return "SYSTEM"; + case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA: + return "DATA+METADATA"; + case BTRFS_BLOCK_GROUP_DATA: + return "DATA"; + case BTRFS_BLOCK_GROUP_METADATA: + return "METADATA"; + default: + return "UNKNOWN"; + } +} + +static void dump_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + DUMP_BLOCK_RSV(fs_info, global_block_rsv); + DUMP_BLOCK_RSV(fs_info, trans_block_rsv); + DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); + DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); + DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); +} + static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info) { + const char *flag_str = space_info_flag_to_str(info); lockdep_assert_held(&info->lock); /* The free space could be negative in case of overcommit */ - btrfs_info(fs_info, "space_info %llu has %lld free, is %sfull", - info->flags, + btrfs_info(fs_info, "space_info %s has %lld free, is %sfull", + flag_str, (s64)(info->total_bytes - btrfs_space_info_used(info, true)), info->full ? "" : "not "); btrfs_info(fs_info, - "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu", +"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu", info->total_bytes, info->bytes_used, info->bytes_pinned, info->bytes_reserved, info->bytes_may_use, info->bytes_readonly, info->bytes_zone_unusable); - - DUMP_BLOCK_RSV(fs_info, global_block_rsv); - DUMP_BLOCK_RSV(fs_info, trans_block_rsv); - DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); - DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); - DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); - } void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, @@ -442,6 +528,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, spin_lock(&info->lock); __btrfs_dump_space_info(fs_info, info); + dump_global_block_rsv(fs_info); spin_unlock(&info->lock); if (!dump_block_groups) @@ -519,7 +606,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2; } - trans = (struct btrfs_trans_handle *)current->journal_info; + trans = current->journal_info; /* * If we are doing more ordered than delalloc we need to just wait on @@ -662,6 +749,18 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case ALLOC_CHUNK: case ALLOC_CHUNK_FORCE: + /* + * For metadata space on zoned filesystem, reaching here means we + * don't have enough space left in active_total_bytes. Try to + * activate a block group first, because we may have inactive + * block group already allocated. + */ + ret = btrfs_zoned_activate_one_bg(fs_info, space_info, false); + if (ret < 0) + break; + else if (ret == 1) + break; + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -672,6 +771,23 @@ static void flush_space(struct btrfs_fs_info *fs_info, (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE); btrfs_end_transaction(trans); + + /* + * For metadata space on zoned filesystem, allocating a new chunk + * is not enough. We still need to activate the block * group. + * Active the newly allocated block group by (maybe) finishing + * a block group. + */ + if (ret == 1) { + ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true); + /* + * Revert to the original ret regardless we could finish + * one block group or not. + */ + if (ret >= 0) + ret = 1; + } + if (ret > 0 || ret == -ENOSPC) ret = 0; break; @@ -709,6 +825,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, { u64 used; u64 avail; + u64 total; u64 to_reclaim = space_info->reclaim_size; lockdep_assert_held(&space_info->lock); @@ -723,8 +840,9 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, * space. If that's the case add in our overage so we make sure to put * appropriate pressure on the flushing state machine. */ - if (space_info->total_bytes + avail < used) - to_reclaim += used - (space_info->total_bytes + avail); + total = writable_total_bytes(fs_info, space_info); + if (total + avail < used) + to_reclaim += used - (total + avail); return to_reclaim; } @@ -734,9 +852,14 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, { u64 global_rsv_size = fs_info->global_block_rsv.reserved; u64 ordered, delalloc; - u64 thresh = div_factor_fine(space_info->total_bytes, 90); + u64 total = writable_total_bytes(fs_info, space_info); + u64 thresh; u64 used; + thresh = div_factor_fine(total, 90); + + lockdep_assert_held(&space_info->lock); + /* If we're just plain full then async reclaim just slows us down. */ if ((space_info->bytes_used + space_info->bytes_reserved + global_rsv_size) >= thresh) @@ -796,8 +919,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, BTRFS_RESERVE_FLUSH_ALL); used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_readonly + global_rsv_size; - if (used < space_info->total_bytes) - thresh += space_info->total_bytes - used; + if (used < total) + thresh += total - used; thresh >>= space_info->clamp; used = space_info->bytes_pinned; @@ -1061,7 +1184,6 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) trans_rsv->reserved; if (block_rsv_size < space_info->bytes_may_use) delalloc_size = space_info->bytes_may_use - block_rsv_size; - spin_unlock(&space_info->lock); /* * We don't want to include the global_rsv in our calculation, @@ -1092,6 +1214,8 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) flush = FLUSH_DELAYED_REFS_NR; } + spin_unlock(&space_info->lock); + /* * We don't want to reclaim everything, just a portion, so scale * down the to_reclaim by 1/4. If it takes us down to 0, @@ -1268,7 +1392,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); /* * This is the priority reclaim path, so to_reclaim could be >0 still - * because we may have only satisified the priority tickets and still + * because we may have only satisfied the priority tickets and still * left non priority tickets on the list. We would then have * to_reclaim but ->bytes == 0. */ @@ -1513,7 +1637,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * can_overcommit() to ensure we can overcommit to continue. */ if (!pending_tickets && - ((used + orig_bytes <= space_info->total_bytes) || + ((used + orig_bytes <= writable_total_bytes(fs_info, space_info)) || btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { btrfs_space_info_update_bytes_may_use(fs_info, space_info, orig_bytes); @@ -1562,7 +1686,6 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, &space_info->priority_tickets); } } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { - used += orig_bytes; /* * We will do the space reservation dance during log replay, * which means we won't have fs_info->fs_root set, so don't do @@ -1637,7 +1760,8 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, int ret; ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA || - flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE); + flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE || + flush == BTRFS_RESERVE_NO_FLUSH); ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush); @@ -1649,3 +1773,17 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, } return ret; } + +/* Dump all the space infos when we abort a transaction due to ENOSPC. */ +__cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *space_info; + + btrfs_info(fs_info, "dumping space info:"); + list_for_each_entry(space_info, &fs_info->space_info, list) { + spin_lock(&space_info->lock); + __btrfs_dump_space_info(fs_info, space_info); + spin_unlock(&space_info->lock); + } + dump_global_block_rsv(fs_info); +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index d841fed73492..ce66023a9eb8 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -3,6 +3,8 @@ #ifndef BTRFS_SPACE_INFO_H #define BTRFS_SPACE_INFO_H +#include "volumes.h" + struct btrfs_space_info { spinlock_t lock; @@ -17,12 +19,22 @@ struct btrfs_space_info { u64 bytes_may_use; /* number of bytes that may be used for delalloc/allocations */ u64 bytes_readonly; /* total bytes that are read only */ + /* Total bytes in the space, but only accounts active block groups. */ + u64 active_total_bytes; u64 bytes_zone_unusable; /* total bytes that are unusable until resetting the device zone */ u64 max_extent_size; /* This will hold the maximum extent size of the space info if we had an ENOSPC in the allocator. */ + /* Chunk size in bytes */ + u64 chunk_size; + + /* + * Once a block group drops below this threshold (percents) we'll + * schedule it for reclaim. + */ + int bg_reclaim_threshold; int clamp; /* Used to scale our threshold for preemptive flushing. The value is >> clamp, so turns @@ -111,10 +123,10 @@ DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info"); DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned"); int btrfs_init_space_info(struct btrfs_fs_info *fs_info); -void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, - u64 total_bytes, u64 bytes_used, - u64 bytes_readonly, u64 bytes_zone_unusable, - struct btrfs_space_info **space_info); +void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, + struct btrfs_block_group *block_group); +void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, + u64 chunk_size); struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, u64 flags); u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, @@ -145,4 +157,7 @@ static inline void btrfs_space_info_free_bytes_may_use( } int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, enum btrfs_reserve_flush_enum flush); +void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info); +void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); + #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index f429256f56db..12455b2b41de 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -12,15 +12,10 @@ static bool check_setget_bounds(const struct extent_buffer *eb, { const unsigned long member_offset = (unsigned long)ptr + off; - if (member_offset > eb->len) { + if (unlikely(member_offset + size > eb->len)) { btrfs_warn(eb->fs_info, - "bad eb member start: ptr 0x%lx start %llu member offset %lu size %d", - (unsigned long)ptr, eb->start, member_offset, size); - return false; - } - if (member_offset + size > eb->len) { - btrfs_warn(eb->fs_info, - "bad eb member end: ptr 0x%lx start %llu member offset %lu size %d", + "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d", + (member_offset > eb->len ? "start" : "end"), (unsigned long)ptr, eb->start, member_offset, size); return false; } diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 29bd8c7a7706..9a176af847d7 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -63,6 +63,29 @@ * This means a slightly higher tree locking latency. */ +bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page) +{ + if (fs_info->sectorsize >= PAGE_SIZE) + return false; + + /* + * Only data pages (either through DIO or compression) can have no + * mapping. And if page->mapping->host is data inode, it's subpage. + * As we have ruled our sectorsize >= PAGE_SIZE case already. + */ + if (!page->mapping || !page->mapping->host || + is_data_inode(page->mapping->host)) + return true; + + /* + * Now the only remaining case is metadata, which we only go subpage + * routine if nodesize < PAGE_SIZE. + */ + if (fs_info->nodesize < PAGE_SIZE) + return true; + return false; +} + void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize) { unsigned int cur = 0; @@ -100,14 +123,14 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct btrfs_subpage *subpage; /* - * We have cases like a dummy extent buffer page, which is not mappped + * We have cases like a dummy extent buffer page, which is not mapped * and doesn't need to be locked. */ if (page->mapping) ASSERT(PageLocked(page)); /* Either not subpage, or the page already has private attached */ - if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page)) + if (!btrfs_is_subpage(fs_info, page) || PagePrivate(page)) return 0; subpage = btrfs_alloc_subpage(fs_info, type); @@ -124,10 +147,10 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct btrfs_subpage *subpage; /* Either not subpage, or already detached */ - if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page)) + if (!btrfs_is_subpage(fs_info, page) || !PagePrivate(page)) return; - subpage = (struct btrfs_subpage *)detach_page_private(page); + subpage = detach_page_private(page); ASSERT(subpage); btrfs_free_subpage(subpage); } @@ -175,7 +198,7 @@ void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, { struct btrfs_subpage *subpage; - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) return; ASSERT(PagePrivate(page) && page->mapping); @@ -190,7 +213,7 @@ void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, { struct btrfs_subpage *subpage; - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) return; ASSERT(PagePrivate(page) && page->mapping); @@ -314,12 +337,12 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, * * Even with 0 returned, the page still need extra check to make sure * it's really the correct page, as the caller is using - * find_get_pages_contig(), which can race with page invalidating. + * filemap_get_folios_contig(), which can race with page invalidating. */ int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { lock_page(page); return 0; } @@ -336,7 +359,7 @@ int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) return unlock_page(page); btrfs_subpage_clamp_range(page, &start, &len); if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len)) @@ -620,7 +643,7 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked); void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ set_page_func(page); \ return; \ } \ @@ -629,7 +652,7 @@ void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ clear_page_func(page); \ return; \ } \ @@ -638,14 +661,14 @@ void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \ return test_page_func(page); \ return btrfs_subpage_test_##name(fs_info, page, start, len); \ } \ void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ set_page_func(page); \ return; \ } \ @@ -655,7 +678,7 @@ void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ clear_page_func(page); \ return; \ } \ @@ -665,7 +688,7 @@ void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \ return test_page_func(page); \ btrfs_subpage_clamp_range(page, &start, &len); \ return btrfs_subpage_test_##name(fs_info, page, start, len); \ @@ -694,7 +717,7 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, return; ASSERT(!PageDirty(page)); - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) return; ASSERT(PagePrivate(page) && page->private); @@ -708,7 +731,7 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, * It should not have any subpage::writers count. * Can be unlocked by unlock_page(). * This is the most common locked page for __extent_writepage() called - * inside extent_write_cache_pages() or extent_write_full_page(). + * inside extent_write_cache_pages(). * Rarer cases include the @locked_page from extent_write_locked_range(). * * - Page locked by lock_delalloc_pages() @@ -722,8 +745,8 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, struct btrfs_subpage *subpage; ASSERT(PageLocked(page)); - /* For regular page size case, we just unlock the page */ - if (fs_info->sectorsize == PAGE_SIZE) + /* For non-subpage case, we just unlock the page */ + if (!btrfs_is_subpage(fs_info, page)) return unlock_page(page); ASSERT(PagePrivate(page) && page->private); @@ -736,7 +759,7 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, * Since we own the page lock, no one else could touch subpage::writers * and we are safe to do several atomic operations without spinlock. */ - if (atomic_read(&subpage->writers)) + if (atomic_read(&subpage->writers) == 0) /* No writers, locked by plain lock_page() */ return unlock_page(page); diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 7accb5c40d33..0e80ad336904 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -74,6 +74,8 @@ enum btrfs_subpage_type { BTRFS_SUBPAGE_DATA, }; +bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page); + void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize); int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct page *page, enum btrfs_subpage_type type); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 4d947ba32da9..5942b9384088 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -48,6 +48,7 @@ #include "block-group.h" #include "discard.h" #include "qgroup.h" +#include "raid56.h" #define CREATE_TRACE_POINTS #include <trace/events/btrfs.h> @@ -66,6 +67,52 @@ static struct file_system_type btrfs_root_fs_type; static int btrfs_remount(struct super_block *sb, int *flags, char *data); +#ifdef CONFIG_PRINTK + +#define STATE_STRING_PREFACE ": state " +#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT) + +/* + * Characters to print to indicate error conditions or uncommon filesystem state. + * RO is not an error. + */ +static const char fs_state_chars[] = { + [BTRFS_FS_STATE_ERROR] = 'E', + [BTRFS_FS_STATE_REMOUNTING] = 'M', + [BTRFS_FS_STATE_RO] = 0, + [BTRFS_FS_STATE_TRANS_ABORTED] = 'A', + [BTRFS_FS_STATE_DEV_REPLACING] = 'R', + [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0, + [BTRFS_FS_STATE_NO_CSUMS] = 'C', + [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L', +}; + +static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) +{ + unsigned int bit; + bool states_printed = false; + unsigned long fs_state = READ_ONCE(info->fs_state); + char *curr = buf; + + memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE)); + curr += sizeof(STATE_STRING_PREFACE) - 1; + + for_each_set_bit(bit, &fs_state, sizeof(fs_state)) { + WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT); + if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) { + *curr++ = fs_state_chars[bit]; + states_printed = true; + } + } + + /* If no states were printed, reset the buffer */ + if (!states_printed) + curr = buf; + + *curr++ = 0; +} +#endif + /* * Generally the error codes correspond to their respective errors, but there * are a few special cases. @@ -128,6 +175,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function { struct super_block *sb = fs_info->sb; #ifdef CONFIG_PRINTK + char statestr[STATE_STRING_BUF_LEN]; const char *errstr; #endif @@ -140,6 +188,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function #ifdef CONFIG_PRINTK errstr = btrfs_decode_error(errno); + btrfs_state_to_string(fs_info, statestr); if (fmt) { struct va_format vaf; va_list args; @@ -148,12 +197,12 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function vaf.fmt = fmt; vaf.va = &args; - pr_crit("BTRFS: error (device %s) in %s:%d: errno=%d %s (%pV)\n", - sb->s_id, function, line, errno, errstr, &vaf); + pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n", + sb->s_id, statestr, function, line, errno, errstr, &vaf); va_end(args); } else { - pr_crit("BTRFS: error (device %s) in %s:%d: errno=%d %s\n", - sb->s_id, function, line, errno, errstr); + pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n", + sb->s_id, statestr, function, line, errno, errstr); } #endif @@ -213,7 +262,7 @@ static struct ratelimit_state printk_limits[] = { RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100), }; -void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) +void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; struct va_format vaf; @@ -240,11 +289,15 @@ void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, . vaf.va = &args; if (__ratelimit(ratelimit)) { - if (fs_info) - printk("%sBTRFS %s (device %s): %pV\n", lvl, type, - fs_info->sb->s_id, &vaf); - else - printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); + if (fs_info) { + char statestr[STATE_STRING_BUF_LEN]; + + btrfs_state_to_string(fs_info, statestr); + _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type, + fs_info->sb->s_id, statestr, &vaf); + } else { + _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); + } } va_end(args); @@ -293,12 +346,14 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) __cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, const char *function, - unsigned int line, int errno) + unsigned int line, int errno, bool first_hit) { struct btrfs_fs_info *fs_info = trans->fs_info; WRITE_ONCE(trans->aborted, errno); WRITE_ONCE(trans->transaction->aborted, errno); + if (first_hit && errno == -ENOSPC) + btrfs_dump_space_info_for_trans_abort(fs_info); /* Wake up anybody who may be waiting on this transaction */ wake_up(&fs_info->transaction_wait); wake_up(&fs_info->transaction_blocked_wait); @@ -573,6 +628,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, int saved_compress_level; bool saved_compress_force; int no_compress = 0; + const bool remounting = test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state); if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE)) btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE); @@ -711,6 +767,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, compress_force = false; no_compress++; } else { + btrfs_err(info, "unrecognized compression value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -769,8 +827,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_thread_pool: ret = match_int(&args[0], &intarg); if (ret) { + btrfs_err(info, "unrecognized thread_pool value %s", + args[0].from); goto out; } else if (intarg == 0) { + btrfs_err(info, "invalid value 0 for thread_pool"); ret = -EINVAL; goto out; } @@ -831,8 +892,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_ratio: ret = match_int(&args[0], &intarg); - if (ret) + if (ret) { + btrfs_err(info, "unrecognized metadata_ratio value %s", + args[0].from); goto out; + } info->metadata_ratio = intarg; btrfs_info(info, "metadata ratio %u", info->metadata_ratio); @@ -849,6 +913,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_set_and_info(info, DISCARD_ASYNC, "turning on async discard"); } else { + btrfs_err(info, "unrecognized discard mode value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -861,6 +927,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_space_cache: case Opt_space_cache_version: + /* + * We already set FREE_SPACE_TREE above because we have + * compat_ro(FREE_SPACE_TREE) set, and we aren't going + * to allow v1 to be set for extent tree v2, simply + * ignore this setting if we're extent tree v2. + */ + if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) + break; if (token == Opt_space_cache || strcmp(args[0].from, "v1") == 0) { btrfs_clear_opt(info->mount_opt, @@ -873,6 +947,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_set_and_info(info, FREE_SPACE_TREE, "enabling free space tree"); } else { + btrfs_err(info, "unrecognized space_cache value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -881,6 +957,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); break; case Opt_no_space_cache: + /* + * We cannot operate without the free space tree with + * extent tree v2, ignore this option. + */ + if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) + break; if (btrfs_test_opt(info, SPACE_CACHE)) { btrfs_clear_and_info(info, SPACE_CACHE, "disabling disk space caching"); @@ -896,6 +978,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, "the 'inode_cache' option is deprecated and has no effect since 5.11"); break; case Opt_clear_cache: + /* + * We cannot clear the free space tree with extent tree + * v2, ignore this option. + */ + if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) + break; btrfs_set_and_info(info, CLEAR_CACHE, "force clearing of disk cache"); break; @@ -942,8 +1030,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_check_integrity_print_mask: ret = match_int(&args[0], &intarg); - if (ret) + if (ret) { + btrfs_err(info, + "unrecognized check_integrity_print_mask value %s", + args[0].from); goto out; + } info->check_integrity_print_mask = intarg; btrfs_info(info, "check_integrity_print_mask 0x%x", info->check_integrity_print_mask); @@ -958,13 +1050,15 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, goto out; #endif case Opt_fatal_errors: - if (strcmp(args[0].from, "panic") == 0) + if (strcmp(args[0].from, "panic") == 0) { btrfs_set_opt(info->mount_opt, PANIC_ON_FATAL_ERROR); - else if (strcmp(args[0].from, "bug") == 0) + } else if (strcmp(args[0].from, "bug") == 0) { btrfs_clear_opt(info->mount_opt, PANIC_ON_FATAL_ERROR); - else { + } else { + btrfs_err(info, "unrecognized fatal_errors value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -972,8 +1066,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_commit_interval: intarg = 0; ret = match_int(&args[0], &intarg); - if (ret) + if (ret) { + btrfs_err(info, "unrecognized commit_interval value %s", + args[0].from); + ret = -EINVAL; goto out; + } if (intarg == 0) { btrfs_info(info, "using default commit interval %us", @@ -987,8 +1085,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_rescue: ret = parse_rescue_options(info, args[0].from); - if (ret < 0) + if (ret < 0) { + btrfs_err(info, "unrecognized rescue value %s", + args[0].from); goto out; + } break; #ifdef CONFIG_BTRFS_DEBUG case Opt_fragment_all: @@ -1039,10 +1140,12 @@ out: } if (!ret) ret = btrfs_check_mountopts_zoned(info); - if (!ret && btrfs_test_opt(info, SPACE_CACHE)) - btrfs_info(info, "disk space caching is enabled"); - if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE)) - btrfs_info(info, "using free space tree"); + if (!ret && !remounting) { + if (btrfs_test_opt(info, SPACE_CACHE)) + btrfs_info(info, "disk space caching is enabled"); + if (btrfs_test_opt(info, FREE_SPACE_TREE)) + btrfs_info(info, "using free space tree"); + } return ret; } @@ -1718,6 +1821,8 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, error = -EBUSY; } else { snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); + shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name, + s->s_id); btrfs_sb(s)->bdev_holder = fs_type; if (!strstr(crc32c_impl(), "generic")) set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); @@ -1831,17 +1936,12 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, old_pool_size, new_pool_size); btrfs_workqueue_set_max(fs_info->workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->endio_meta_write_workers, - new_pool_size); btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size); btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers, - new_pool_size); } static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info, @@ -1914,6 +2014,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (ret) goto restore; + ret = btrfs_check_features(fs_info, sb); + if (ret < 0) + goto restore; + btrfs_remount_begin(fs_info, old_opts, *flags); btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size, old_thread_pool_size); @@ -2142,12 +2246,8 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, if (type & BTRFS_BLOCK_GROUP_RAID0) num_stripes = nr_devices; - else if (type & BTRFS_BLOCK_GROUP_RAID1) - num_stripes = 2; - else if (type & BTRFS_BLOCK_GROUP_RAID1C3) - num_stripes = 3; - else if (type & BTRFS_BLOCK_GROUP_RAID1C4) - num_stripes = 4; + else if (type & BTRFS_BLOCK_GROUP_RAID1_MASK) + num_stripes = rattr->ncopies; else if (type & BTRFS_BLOCK_GROUP_RAID10) num_stripes = 4; @@ -2171,17 +2271,13 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, avail_space = rounddown(avail_space, BTRFS_STRIPE_LEN); /* - * In order to avoid overwriting the superblock on the drive, - * btrfs starts at an offset of at least 1MB when doing chunk - * allocation. - * - * This ensures we have at least min_stripe_size free space - * after excluding 1MB. + * Ensure we have at least min_stripe_size on top of the + * reserved space on the device. */ - if (avail_space <= SZ_1M + min_stripe_size) + if (avail_space <= BTRFS_DEVICE_RANGE_RESERVED + min_stripe_size) continue; - avail_space -= SZ_1M; + avail_space -= BTRFS_DEVICE_RANGE_RESERVED; devices_info[i].dev = device; devices_info[i].max_avail = avail_space; @@ -2383,6 +2479,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, { struct btrfs_ioctl_vol_args *vol; struct btrfs_device *device = NULL; + dev_t devt = 0; int ret = -ENOTTY; if (!capable(CAP_SYS_ADMIN)) @@ -2402,7 +2499,12 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, mutex_unlock(&uuid_mutex); break; case BTRFS_IOC_FORGET_DEV: - ret = btrfs_forget_devices(vol->name); + if (vol->name[0] != 0) { + ret = lookup_bdev(vol->name, &devt); + if (ret) + break; + } + ret = btrfs_forget_devices(devt); break; case BTRFS_IOC_DEVICES_READY: mutex_lock(&uuid_mutex); @@ -2449,11 +2551,87 @@ static int btrfs_freeze(struct super_block *sb) return btrfs_commit_transaction(trans); } +static int check_dev_super(struct btrfs_device *dev) +{ + struct btrfs_fs_info *fs_info = dev->fs_info; + struct btrfs_super_block *sb; + u16 csum_type; + int ret = 0; + + /* This should be called with fs still frozen. */ + ASSERT(test_bit(BTRFS_FS_FROZEN, &fs_info->flags)); + + /* Missing dev, no need to check. */ + if (!dev->bdev) + return 0; + + /* Only need to check the primary super block. */ + sb = btrfs_read_dev_one_super(dev->bdev, 0, true); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + /* Verify the checksum. */ + csum_type = btrfs_super_csum_type(sb); + if (csum_type != btrfs_super_csum_type(fs_info->super_copy)) { + btrfs_err(fs_info, "csum type changed, has %u expect %u", + csum_type, btrfs_super_csum_type(fs_info->super_copy)); + ret = -EUCLEAN; + goto out; + } + + if (btrfs_check_super_csum(fs_info, sb)) { + btrfs_err(fs_info, "csum for on-disk super block no longer matches"); + ret = -EUCLEAN; + goto out; + } + + /* Btrfs_validate_super() includes fsid check against super->fsid. */ + ret = btrfs_validate_super(fs_info, sb, 0); + if (ret < 0) + goto out; + + if (btrfs_super_generation(sb) != fs_info->last_trans_committed) { + btrfs_err(fs_info, "transid mismatch, has %llu expect %llu", + btrfs_super_generation(sb), + fs_info->last_trans_committed); + ret = -EUCLEAN; + goto out; + } +out: + btrfs_release_disk_super(sb); + return ret; +} + static int btrfs_unfreeze(struct super_block *sb) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_device *device; + int ret = 0; + /* + * Make sure the fs is not changed by accident (like hibernation then + * modified by other OS). + * If we found anything wrong, we mark the fs error immediately. + * + * And since the fs is frozen, no one can modify the fs yet, thus + * we don't need to hold device_list_mutex. + */ + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { + ret = check_dev_super(device); + if (ret < 0) { + btrfs_handle_fs_error(fs_info, ret, + "super block on devid %llu got modified unexpectedly", + device->devid); + break; + } + } clear_bit(BTRFS_FS_FROZEN, &fs_info->flags); + + /* + * We still return 0, to allow VFS layer to unfreeze the fs even the + * above checks failed. Since the fs is either fine or read-only, we're + * safe to continue, without causing further damage. + */ return 0; } @@ -2561,17 +2739,21 @@ static int __init init_btrfs_fs(void) if (err) goto free_compress; - err = extent_io_init(); + err = extent_state_init_cachep(); if (err) goto free_cachep; - err = extent_state_cache_init(); + err = extent_buffer_init_cachep(); + if (err) + goto free_extent_cachep; + + err = btrfs_bioset_init(); if (err) - goto free_extent_io; + goto free_eb_cachep; err = extent_map_init(); if (err) - goto free_extent_state_cache; + goto free_bioset; err = ordered_data_init(); if (err) @@ -2593,13 +2775,9 @@ static int __init init_btrfs_fs(void) if (err) goto free_delayed_ref; - err = btrfs_end_io_wq_init(); - if (err) - goto free_prelim_ref; - err = btrfs_interface_init(); if (err) - goto free_end_io_wq; + goto free_prelim_ref; btrfs_print_mod_info(); @@ -2615,8 +2793,6 @@ static int __init init_btrfs_fs(void) unregister_ioctl: btrfs_interface_exit(); -free_end_io_wq: - btrfs_end_io_wq_exit(); free_prelim_ref: btrfs_prelim_ref_exit(); free_delayed_ref: @@ -2629,10 +2805,12 @@ free_ordered_data: ordered_data_exit(); free_extent_map: extent_map_exit(); -free_extent_state_cache: - extent_state_cache_exit(); -free_extent_io: - extent_io_exit(); +free_bioset: + btrfs_bioset_exit(); +free_eb_cachep: + extent_buffer_free_cachep(); +free_extent_cachep: + extent_state_free_cachep(); free_cachep: btrfs_destroy_cachep(); free_compress: @@ -2651,10 +2829,10 @@ static void __exit exit_btrfs_fs(void) btrfs_prelim_ref_exit(); ordered_data_exit(); extent_map_exit(); - extent_state_cache_exit(); - extent_io_exit(); + btrfs_bioset_exit(); + extent_state_free_cachep(); + extent_buffer_free_cachep(); btrfs_interface_exit(); - btrfs_end_io_wq_exit(); unregister_filesystem(&btrfs_fs_type); btrfs_exit_sysfs(); btrfs_cleanup_fs_uuids(); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index beb7f72d50b8..74fef1f49c35 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -21,6 +21,7 @@ #include "space-info.h" #include "block-group.h" #include "qgroup.h" +#include "misc.h" /* * Structure name Path @@ -34,12 +35,12 @@ * qgroup_attrs /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid> * space_info_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type> * raid_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile> + * discard_attrs /sys/fs/btrfs/<uuid>/discard * * When built with BTRFS_CONFIG_DEBUG: * * btrfs_debug_feature_attrs /sys/fs/btrfs/debug * btrfs_debug_mount_attrs /sys/fs/btrfs/<uuid>/debug - * discard_debug_attrs /sys/fs/btrfs/<uuid>/debug/discard */ struct btrfs_feature_attr { @@ -61,6 +62,10 @@ struct raid_kobject { .store = _store, \ } +#define BTRFS_ATTR_W(_prefix, _name, _store) \ + static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ + __INIT_KOBJ_ATTR(_name, 0200, NULL, _store) + #define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \ static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ __INIT_KOBJ_ATTR(_name, 0644, _show, _store) @@ -92,6 +97,7 @@ static struct btrfs_feature_attr btrfs_attr_features_##_name = { \ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj); +static struct kobject *get_btrfs_kobj(struct kobject *kobj); static struct btrfs_feature_attr *to_btrfs_feature_attr(struct kobj_attribute *a) { @@ -270,23 +276,25 @@ static umode_t btrfs_feature_visible(struct kobject *kobj, return mode; } -BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF); BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL); BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS); BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO); BTRFS_FEAT_ATTR_INCOMPAT(compress_zstd, COMPRESS_ZSTD); -BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA); BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); +BTRFS_FEAT_ATTR_COMPAT_RO(block_group_tree, BLOCK_GROUP_TREE); BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34); -/* Remove once support for zoned allocation is feature complete */ -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); #endif +#ifdef CONFIG_BTRFS_DEBUG +/* Remove once support for extent tree v2 is feature complete */ +BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2); +#endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY); #endif @@ -294,17 +302,15 @@ BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY); /* * Features which depend on feature bits and may differ between each fs. * - * /sys/fs/btrfs/features - all available features implemeted by this version + * /sys/fs/btrfs/features - all available features implemented by this version * /sys/fs/btrfs/UUID/features - features of the fs which are enabled or * can be changed on a mounted filesystem. */ static struct attribute *btrfs_supported_feature_attrs[] = { - BTRFS_FEAT_ATTR_PTR(mixed_backref), BTRFS_FEAT_ATTR_PTR(default_subvol), BTRFS_FEAT_ATTR_PTR(mixed_groups), BTRFS_FEAT_ATTR_PTR(compress_lzo), BTRFS_FEAT_ATTR_PTR(compress_zstd), - BTRFS_FEAT_ATTR_PTR(big_metadata), BTRFS_FEAT_ATTR_PTR(extended_iref), BTRFS_FEAT_ATTR_PTR(raid56), BTRFS_FEAT_ATTR_PTR(skinny_metadata), @@ -312,9 +318,13 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(metadata_uuid), BTRFS_FEAT_ATTR_PTR(free_space_tree), BTRFS_FEAT_ATTR_PTR(raid1c34), -#ifdef CONFIG_BTRFS_DEBUG + BTRFS_FEAT_ATTR_PTR(block_group_tree), +#ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_PTR(zoned), #endif +#ifdef CONFIG_BTRFS_DEBUG + BTRFS_FEAT_ATTR_PTR(extent_tree_v2), +#endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_PTR(verity), #endif @@ -391,11 +401,9 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj, { ssize_t ret = 0; - /* 4K sector size is also supported with 64K page size */ - if (PAGE_SIZE == SZ_64K) + /* An artificial limit to only support 4K and PAGE_SIZE */ + if (PAGE_SIZE > SZ_4K) ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K); - - /* Only sectorsize == PAGE_SIZE is now supported */ ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE); return ret; @@ -423,12 +431,10 @@ static const struct attribute_group btrfs_static_feature_attr_group = { .attrs = btrfs_supported_static_feature_attrs, }; -#ifdef CONFIG_BTRFS_DEBUG - /* * Discard statistics and tunables */ -#define discard_to_fs_info(_kobj) to_fs_info((_kobj)->parent->parent) +#define discard_to_fs_info(_kobj) to_fs_info(get_btrfs_kobj(_kobj)) static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj, struct kobj_attribute *a, @@ -577,11 +583,11 @@ BTRFS_ATTR_RW(discard, max_discard_size, btrfs_discard_max_discard_size_show, btrfs_discard_max_discard_size_store); /* - * Per-filesystem debugging of discard (when mounted with discard=async). + * Per-filesystem stats for discard (when mounted with discard=async). * - * Path: /sys/fs/btrfs/<uuid>/debug/discard/ + * Path: /sys/fs/btrfs/<uuid>/discard/ */ -static const struct attribute *discard_debug_attrs[] = { +static const struct attribute *discard_attrs[] = { BTRFS_ATTR_PTR(discard, discardable_bytes), BTRFS_ATTR_PTR(discard, discardable_extents), BTRFS_ATTR_PTR(discard, discard_bitmap_bytes), @@ -593,6 +599,8 @@ static const struct attribute *discard_debug_attrs[] = { NULL, }; +#ifdef CONFIG_BTRFS_DEBUG + /* * Per-filesystem runtime debugging exported via sysfs. * @@ -708,6 +716,112 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \ } \ BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field) +static ssize_t btrfs_chunk_size_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_space_info *sinfo = to_space_info(kobj); + + return sysfs_emit(buf, "%llu\n", READ_ONCE(sinfo->chunk_size)); +} + +/* + * Store new chunk size in space info. Can be called on a read-only filesystem. + * + * If the new chunk size value is larger than 10% of free space it is reduced + * to match that limit. Alignment must be to 256M and the system chunk size + * cannot be set. + */ +static ssize_t btrfs_chunk_size_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj)); + char *retptr; + u64 val; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!fs_info->fs_devices) + return -EINVAL; + + if (btrfs_is_zoned(fs_info)) + return -EINVAL; + + /* System block type must not be changed. */ + if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) + return -EPERM; + + val = memparse(buf, &retptr); + /* There could be trailing '\n', also catch any typos after the value */ + retptr = skip_spaces(retptr); + if (*retptr != 0 || val == 0) + return -EINVAL; + + val = min(val, BTRFS_MAX_DATA_CHUNK_SIZE); + + /* Limit stripe size to 10% of available space. */ + val = min(div_factor(fs_info->fs_devices->total_rw_bytes, 1), val); + + /* Must be multiple of 256M. */ + val &= ~((u64)SZ_256M - 1); + + /* Must be at least 256M. */ + if (val < SZ_256M) + return -EINVAL; + + btrfs_update_space_info_chunk_size(space_info, val); + + return len; +} + +#ifdef CONFIG_BTRFS_DEBUG +/* + * Request chunk allocation with current chunk size. + */ +static ssize_t btrfs_force_chunk_alloc_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj)); + struct btrfs_trans_handle *trans; + bool val; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (sb_rdonly(fs_info->sb)) + return -EROFS; + + ret = kstrtobool(buf, &val); + if (ret) + return ret; + + if (!val) + return -EINVAL; + + /* + * This is unsafe to be called from sysfs context and may cause + * unexpected problems. + */ + trans = btrfs_start_transaction(fs_info->tree_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + ret = btrfs_force_chunk_alloc(trans, space_info->flags); + btrfs_end_transaction(trans); + + if (ret == 1) + return len; + + return -ENOSPC; +} +BTRFS_ATTR_W(space_info, force_chunk_alloc, btrfs_force_chunk_alloc_store); + +#endif + SPACE_INFO_ATTR(flags); SPACE_INFO_ATTR(total_bytes); SPACE_INFO_ATTR(bytes_used); @@ -718,6 +832,40 @@ SPACE_INFO_ATTR(bytes_readonly); SPACE_INFO_ATTR(bytes_zone_unusable); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); +BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store); + +static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + + return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold)); +} + +static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + int thresh; + int ret; + + ret = kstrtoint(buf, 10, &thresh); + if (ret) + return ret; + + if (thresh < 0 || thresh > 100) + return -EINVAL; + + WRITE_ONCE(space_info->bg_reclaim_threshold, thresh); + + return len; +} + +BTRFS_ATTR_RW(space_info, bg_reclaim_threshold, + btrfs_sinfo_bg_reclaim_threshold_show, + btrfs_sinfo_bg_reclaim_threshold_store); /* * Allocation information about block group types. @@ -735,6 +883,11 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, bytes_zone_unusable), BTRFS_ATTR_PTR(space_info, disk_used), BTRFS_ATTR_PTR(space_info, disk_total), + BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold), + BTRFS_ATTR_PTR(space_info, chunk_size), +#ifdef CONFIG_BTRFS_DEBUG + BTRFS_ATTR_PTR(space_info, force_chunk_alloc), +#endif NULL, }; ATTRIBUTE_GROUPS(space_info); @@ -833,6 +986,48 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj, BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); +static ssize_t btrfs_commit_stats_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return sysfs_emit(buf, + "commits %llu\n" + "last_commit_ms %llu\n" + "max_commit_ms %llu\n" + "total_commit_ms %llu\n", + fs_info->commit_stats.commit_count, + div_u64(fs_info->commit_stats.last_commit_dur, NSEC_PER_MSEC), + div_u64(fs_info->commit_stats.max_commit_dur, NSEC_PER_MSEC), + div_u64(fs_info->commit_stats.total_commit_dur, NSEC_PER_MSEC)); +} + +static ssize_t btrfs_commit_stats_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + unsigned long val; + int ret; + + if (!fs_info) + return -EPERM; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + ret = kstrtoul(buf, 10, &val); + if (ret) + return ret; + if (val) + return -EINVAL; + + WRITE_ONCE(fs_info->commit_stats.max_commit_dur, 0); + + return len; +} +BTRFS_ATTR_RW(, commit_stats, btrfs_commit_stats_show, btrfs_commit_stats_store); + static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { @@ -919,6 +1114,9 @@ static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj, case BTRFS_EXCLOP_BALANCE: str = "balance\n"; break; + case BTRFS_EXCLOP_BALANCE_PAUSED: + str = "balance paused\n"; + break; case BTRFS_EXCLOP_DEV_ADD: str = "device add\n"; break; @@ -951,25 +1149,6 @@ static ssize_t btrfs_generation_show(struct kobject *kobj, } BTRFS_ATTR(, generation, btrfs_generation_show); -/* - * Look for an exact string @string in @buffer with possible leading or - * trailing whitespace - */ -static bool strmatch(const char *buffer, const char *string) -{ - const size_t len = strlen(string); - - /* Skip leading whitespace */ - buffer = skip_spaces(buffer); - - /* Match entire string, check if the rest is whitespace or empty */ - if (strncmp(string, buffer, len) == 0 && - strlen(skip_spaces(buffer + len)) == 0) - return true; - - return false; -} - static const char * const btrfs_read_policy_name[] = { "pid" }; static ssize_t btrfs_read_policy_show(struct kobject *kobj, @@ -1003,7 +1182,7 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, int i; for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { - if (strmatch(buf, btrfs_read_policy_name[i])) { + if (sysfs_streq(buf, btrfs_read_policy_name[i])) { if (i != fs_devices->read_policy) { fs_devices->read_policy = i; btrfs_info(fs_devices->fs_info, @@ -1023,11 +1202,8 @@ static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - ssize_t ret; - ret = sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold)); - - return ret; + return sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold)); } static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, @@ -1069,6 +1245,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, generation), BTRFS_ATTR_PTR(, read_policy), BTRFS_ATTR_PTR(, bg_reclaim_threshold), + BTRFS_ATTR_PTR(, commit_stats), NULL, }; @@ -1099,11 +1276,26 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) return to_fs_devs(kobj)->fs_info; } +static struct kobject *get_btrfs_kobj(struct kobject *kobj) +{ + while (kobj) { + if (kobj->ktype == &btrfs_ktype) + return kobj; + kobj = kobj->parent; + } + return NULL; +} + #define NUM_FEATURE_BITS 64 #define BTRFS_FEATURE_NAME_MAX 13 static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX]; static struct btrfs_feature_attr btrfs_feature_attrs[FEAT_MAX][NUM_FEATURE_BITS]; +static_assert(ARRAY_SIZE(btrfs_unknown_feature_names) == + ARRAY_SIZE(btrfs_feature_attrs)); +static_assert(ARRAY_SIZE(btrfs_unknown_feature_names[0]) == + ARRAY_SIZE(btrfs_feature_attrs[0])); + static const u64 supported_feature_masks[FEAT_MAX] = { [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP, [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP, @@ -1212,13 +1404,12 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) kobject_del(fs_info->space_info_kobj); kobject_put(fs_info->space_info_kobj); } -#ifdef CONFIG_BTRFS_DEBUG - if (fs_info->discard_debug_kobj) { - sysfs_remove_files(fs_info->discard_debug_kobj, - discard_debug_attrs); - kobject_del(fs_info->discard_debug_kobj); - kobject_put(fs_info->discard_debug_kobj); + if (fs_info->discard_kobj) { + sysfs_remove_files(fs_info->discard_kobj, discard_attrs); + kobject_del(fs_info->discard_kobj); + kobject_put(fs_info->discard_kobj); } +#ifdef CONFIG_BTRFS_DEBUG if (fs_info->debug_kobj) { sysfs_remove_files(fs_info->debug_kobj, btrfs_debug_mount_attrs); kobject_del(fs_info->debug_kobj); @@ -1272,11 +1463,6 @@ static void init_feature_attrs(void) struct btrfs_feature_attr *fa; int set, i; - BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names) != - ARRAY_SIZE(btrfs_feature_attrs)); - BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names[0]) != - ARRAY_SIZE(btrfs_feature_attrs[0])); - memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs)); memset(btrfs_unknown_feature_names, 0, sizeof(btrfs_unknown_feature_names)); @@ -1791,20 +1977,18 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) error = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs); if (error) goto failure; +#endif /* Discard directory */ - fs_info->discard_debug_kobj = kobject_create_and_add("discard", - fs_info->debug_kobj); - if (!fs_info->discard_debug_kobj) { + fs_info->discard_kobj = kobject_create_and_add("discard", fsid_kobj); + if (!fs_info->discard_kobj) { error = -ENOMEM; goto failure; } - error = sysfs_create_files(fs_info->discard_debug_kobj, - discard_debug_attrs); + error = sysfs_create_files(fs_info->discard_kobj, discard_attrs); if (error) goto failure; -#endif error = addrm_unknown_feature_attrs(fs_info, true); if (error) @@ -1831,6 +2015,98 @@ failure: return error; } +static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); + bool enabled; + + spin_lock(&fs_info->qgroup_lock); + enabled = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON; + spin_unlock(&fs_info->qgroup_lock); + + return sysfs_emit(buf, "%d\n", enabled); +} +BTRFS_ATTR(qgroups, enabled, qgroup_enabled_show); + +static ssize_t qgroup_inconsistent_show(struct kobject *qgroups_kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); + bool inconsistent; + + spin_lock(&fs_info->qgroup_lock); + inconsistent = (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT); + spin_unlock(&fs_info->qgroup_lock); + + return sysfs_emit(buf, "%d\n", inconsistent); +} +BTRFS_ATTR(qgroups, inconsistent, qgroup_inconsistent_show); + +static ssize_t qgroup_drop_subtree_thres_show(struct kobject *qgroups_kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); + u8 result; + + spin_lock(&fs_info->qgroup_lock); + result = fs_info->qgroup_drop_subtree_thres; + spin_unlock(&fs_info->qgroup_lock); + + return sysfs_emit(buf, "%d\n", result); +} + +static ssize_t qgroup_drop_subtree_thres_store(struct kobject *qgroups_kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); + u8 new_thres; + int ret; + + ret = kstrtou8(buf, 10, &new_thres); + if (ret) + return -EINVAL; + + if (new_thres > BTRFS_MAX_LEVEL) + return -EINVAL; + + spin_lock(&fs_info->qgroup_lock); + fs_info->qgroup_drop_subtree_thres = new_thres; + spin_unlock(&fs_info->qgroup_lock); + + return len; +} +BTRFS_ATTR_RW(qgroups, drop_subtree_threshold, qgroup_drop_subtree_thres_show, + qgroup_drop_subtree_thres_store); + +/* + * Qgroups global info + * + * Path: /sys/fs/btrfs/<uuid>/qgroups/ + */ +static struct attribute *qgroups_attrs[] = { + BTRFS_ATTR_PTR(qgroups, enabled), + BTRFS_ATTR_PTR(qgroups, inconsistent), + BTRFS_ATTR_PTR(qgroups, drop_subtree_threshold), + NULL +}; +ATTRIBUTE_GROUPS(qgroups); + +static void qgroups_release(struct kobject *kobj) +{ + kfree(kobj); +} + +static struct kobj_type qgroups_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = qgroups_groups, + .release = qgroups_release, +}; + static inline struct btrfs_fs_info *qgroup_kobj_to_fs_info(struct kobject *kobj) { return to_fs_info(kobj->parent->parent); @@ -1956,11 +2232,15 @@ int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info) if (fs_info->qgroups_kobj) return 0; - fs_info->qgroups_kobj = kobject_create_and_add("qgroups", fsid_kobj); - if (!fs_info->qgroups_kobj) { - ret = -ENOMEM; + fs_info->qgroups_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); + if (!fs_info->qgroups_kobj) + return -ENOMEM; + + ret = kobject_init_and_add(fs_info->qgroups_kobj, &qgroups_ktype, + fsid_kobj, "qgroups"); + if (ret < 0) goto out; - } + rbtree_postorder_for_each_entry_safe(qgroup, next, &fs_info->qgroup_tree, node) { ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); @@ -2041,8 +2321,11 @@ int __init btrfs_init_sysfs(void) #ifdef CONFIG_BTRFS_DEBUG ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group); - if (ret) - goto out2; + if (ret) { + sysfs_unmerge_group(&btrfs_kset->kobj, + &btrfs_static_feature_attr_group); + goto out_remove_group; + } #endif return 0; @@ -2065,4 +2348,3 @@ void __cold btrfs_exit_sysfs(void) #endif kset_unregister(btrfs_kset); } - diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index d8e56edd6991..d43cb5242fec 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -59,6 +59,7 @@ struct inode *btrfs_new_test_inode(void) return NULL; inode->i_mode = S_IFREG; + inode->i_ino = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; @@ -199,7 +200,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) void btrfs_free_dummy_root(struct btrfs_root *root) { - if (!root) + if (IS_ERR_OR_NULL(root)) return; /* Will be freed by btrfs_free_fs_roots */ if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state))) @@ -242,7 +243,7 @@ void btrfs_free_dummy_block_group(struct btrfs_block_group *cache) { if (!cache) return; - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); kfree(cache->free_space_ctl); kfree(cache); } diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index 51a8b075c259..b7d181a08eab 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -47,7 +47,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) goto out; } - path->nodes[0] = eb = alloc_dummy_extent_buffer(fs_info, nodesize); + eb = alloc_dummy_extent_buffer(fs_info, nodesize); + path->nodes[0] = eb; if (!eb) { test_std_err(TEST_ALLOC_EXTENT_BUFFER); ret = -ENOMEM; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index a232b15b8021..350da449db08 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -4,6 +4,7 @@ */ #include <linux/pagemap.h> +#include <linux/pagevec.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/sizes.h> @@ -20,39 +21,40 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end, unsigned long flags) { int ret; - struct page *pages[16]; + struct folio_batch fbatch; unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; - unsigned long nr_pages = end_index - index + 1; int i; int count = 0; int loops = 0; - while (nr_pages > 0) { - ret = find_get_pages_contig(inode->i_mapping, index, - min_t(unsigned long, nr_pages, - ARRAY_SIZE(pages)), pages); + folio_batch_init(&fbatch); + + while (index <= end_index) { + ret = filemap_get_folios_contig(inode->i_mapping, &index, + end_index, &fbatch); for (i = 0; i < ret; i++) { + struct folio *folio = fbatch.folios[i]; + if (flags & PROCESS_TEST_LOCKED && - !PageLocked(pages[i])) + !folio_test_locked(folio)) count++; - if (flags & PROCESS_UNLOCK && PageLocked(pages[i])) - unlock_page(pages[i]); - put_page(pages[i]); + if (flags & PROCESS_UNLOCK && folio_test_locked(folio)) + folio_unlock(folio); if (flags & PROCESS_RELEASE) - put_page(pages[i]); + folio_put(folio); } - nr_pages -= ret; - index += ret; + folio_batch_release(&fbatch); cond_resched(); loops++; if (loops > 100000) { printk(KERN_ERR - "stuck in a loop, start %llu, end %llu, nr_pages %lu, ret %d\n", - start, end, nr_pages, ret); + "stuck in a loop, start %llu, end %llu, ret %d\n", + start, end, ret); break; } } + return count; } @@ -80,7 +82,6 @@ static void extent_flag_to_str(const struct extent_state *state, char *dest) PRINT_ONE_FLAG(state, dest, cur, NODATASUM); PRINT_ONE_FLAG(state, dest, cur, CLEAR_META_RESV); PRINT_ONE_FLAG(state, dest, cur, NEED_WAIT); - PRINT_ONE_FLAG(state, dest, cur, DAMAGED); PRINT_ONE_FLAG(state, dest, cur, NORESERVE); PRINT_ONE_FLAG(state, dest, cur, QGROUP_RESERVED); PRINT_ONE_FLAG(state, dest, cur, CLEAR_DATA_RESV); @@ -172,7 +173,7 @@ static int test_find_delalloc(u32 sectorsize) sectorsize - 1, start, end); goto out_bits; } - unlock_extent(tmp, start, end); + unlock_extent(tmp, start, end, NULL); unlock_page(locked_page); put_page(locked_page); @@ -208,7 +209,7 @@ static int test_find_delalloc(u32 sectorsize) test_err("there were unlocked pages in the range"); goto out_bits; } - unlock_extent(tmp, start, end); + unlock_extent(tmp, start, end, NULL); /* locked_page was unlocked above */ put_page(locked_page); @@ -263,7 +264,7 @@ static int test_find_delalloc(u32 sectorsize) test_err("pages in range were not all locked"); goto out_bits; } - unlock_extent(tmp, start, end); + unlock_extent(tmp, start, end, NULL); /* * Now to test where we run into a page that is no longer dirty in the diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 319fed82d741..c5b3a631bf4f 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -15,6 +15,7 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree) struct extent_map *em; struct rb_node *node; + write_lock(&em_tree->lock); while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) { node = rb_first_cached(&em_tree->map); em = rb_entry(node, struct extent_map, rb_node); @@ -32,6 +33,7 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree) #endif free_extent_map(em); } + write_unlock(&em_tree->lock); } /* diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 5930cdcae5cb..ebf68fcd2149 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -82,7 +82,7 @@ static int test_extents(struct btrfs_block_group *cache) } /* Cleanup */ - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); return 0; } @@ -149,7 +149,7 @@ static int test_bitmaps(struct btrfs_block_group *cache, u32 sectorsize) return -1; } - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); return 0; } @@ -230,7 +230,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache, return -1; } - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); /* Now with the extent entry offset into the bitmap */ ret = test_add_free_space_entry(cache, SZ_4M, SZ_4M, 1); @@ -266,7 +266,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache, * [ bitmap ] * [ del ] */ - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); ret = test_add_free_space_entry(cache, bitmap_offset + SZ_4M, SZ_4M, 1); if (ret) { test_err("couldn't add bitmap %d", ret); @@ -291,7 +291,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache, return -1; } - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); /* * This blew up before, we have part of the free space in a bitmap and @@ -317,7 +317,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache, return ret; } - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); return 0; } @@ -629,7 +629,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache, if (ret) return ret; - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); /* * Now test a similar scenario, but where our extent entry is located @@ -819,7 +819,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache, return ret; cache->free_space_ctl->op = orig_free_space_ops; - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); return 0; } @@ -868,7 +868,7 @@ static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize) } /* Now validate bitmaps do the correct thing. */ - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); for (i = 0; i < 2; i++) { offset = i * BITS_PER_BITMAP * sectorsize; bytes = (i + 1) * SZ_1M; @@ -891,7 +891,7 @@ static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize) } /* Now validate bitmaps with different ->max_extent_size. */ - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); orig_free_space_ops = cache->free_space_ctl->op; cache->free_space_ctl->op = &test_free_space_ops; @@ -998,7 +998,7 @@ static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize) } cache->free_space_ctl->op = orig_free_space_ops; - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); return 0; } diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index cac89c388131..625f7d398368 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -267,7 +267,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } free_extent_map(em); - btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); + btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); /* * All of the magic numbers are based on the mapping setup in @@ -975,7 +975,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) BTRFS_MAX_EXTENT_SIZE >> 1, (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_UPTODATE, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1043,7 +1043,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_UPTODATE, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1076,7 +1076,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* Empty */ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_UPTODATE, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1092,7 +1092,7 @@ out: if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_UPTODATE, NULL); iput(inode); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index eee1e4459541..63676ea19f29 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -225,20 +225,20 @@ static int test_no_shared_qgroup(struct btrfs_root *root, */ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { - ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, BTRFS_FS_TREE_OBJECTID); - if (ret) + if (ret) { + ulist_free(old_roots); return ret; + } ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); - ulist_free(new_roots); test_err("couldn't find old roots: %d", ret); return ret; } @@ -250,29 +250,31 @@ static int test_no_shared_qgroup(struct btrfs_root *root, return ret; } + /* btrfs_qgroup_account_extent() always frees the ulists passed to it. */ + old_roots = NULL; + new_roots = NULL; + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, nodesize, nodesize)) { test_err("qgroup counts didn't match expected values"); return -EINVAL; } - old_roots = NULL; - new_roots = NULL; ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { - ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } ret = remove_extent_item(root, nodesize, nodesize); - if (ret) + if (ret) { + ulist_free(old_roots); return -EINVAL; + } ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); - ulist_free(new_roots); test_err("couldn't find old roots: %d", ret); return ret; } @@ -322,20 +324,20 @@ static int test_multiple_refs(struct btrfs_root *root, ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { - ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, BTRFS_FS_TREE_OBJECTID); - if (ret) + if (ret) { + ulist_free(old_roots); return ret; + } ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); - ulist_free(new_roots); test_err("couldn't find old roots: %d", ret); return ret; } @@ -355,20 +357,20 @@ static int test_multiple_refs(struct btrfs_root *root, ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { - ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } ret = add_tree_ref(root, nodesize, nodesize, 0, BTRFS_FIRST_FREE_OBJECTID); - if (ret) + if (ret) { + ulist_free(old_roots); return ret; + } ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); - ulist_free(new_roots); test_err("couldn't find old roots: %d", ret); return ret; } @@ -394,20 +396,20 @@ static int test_multiple_refs(struct btrfs_root *root, ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { - ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } ret = remove_extent_ref(root, nodesize, nodesize, 0, BTRFS_FIRST_FREE_OBJECTID); - if (ret) + if (ret) { + ulist_free(old_roots); return ret; + } ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); - ulist_free(new_roots); test_err("couldn't find old roots: %d", ret); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 03de89b45f27..d1f1da6820fb 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -10,6 +10,7 @@ #include <linux/pagemap.h> #include <linux/blkdev.h> #include <linux/uuid.h> +#include <linux/timekeeping.h> #include "misc.h" #include "ctree.h" #include "disk-io.h" @@ -160,7 +161,6 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root, *tmp; - struct btrfs_caching_control *caching_ctl, *next; /* * At this point no one can be using this transaction to modify any tree @@ -195,46 +195,6 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) } spin_unlock(&cur_trans->dropped_roots_lock); - /* - * We have to update the last_byte_to_unpin under the commit_root_sem, - * at the same time we swap out the commit roots. - * - * This is because we must have a real view of the last spot the caching - * kthreads were while caching. Consider the following views of the - * extent tree for a block group - * - * commit root - * +----+----+----+----+----+----+----+ - * |\\\\| |\\\\|\\\\| |\\\\|\\\\| - * +----+----+----+----+----+----+----+ - * 0 1 2 3 4 5 6 7 - * - * new commit root - * +----+----+----+----+----+----+----+ - * | | | |\\\\| | |\\\\| - * +----+----+----+----+----+----+----+ - * 0 1 2 3 4 5 6 7 - * - * If the cache_ctl->progress was at 3, then we are only allowed to - * unpin [0,1) and [2,3], because the caching thread has already - * processed those extents. We are not allowed to unpin [5,6), because - * the caching thread will re-start it's search from 3, and thus find - * the hole from [4,6) to add to the free space cache. - */ - spin_lock(&fs_info->block_group_cache_lock); - list_for_each_entry_safe(caching_ctl, next, - &fs_info->caching_block_groups, list) { - struct btrfs_block_group *cache = caching_ctl->block_group; - - if (btrfs_block_group_done(cache)) { - cache->last_byte_to_unpin = (u64)-1; - list_del_init(&caching_ctl->list); - btrfs_put_caching_control(caching_ctl); - } else { - cache->last_byte_to_unpin = caching_ctl->progress; - } - } - spin_unlock(&fs_info->block_group_cache_lock); up_write(&fs_info->commit_root_sem); } @@ -312,6 +272,8 @@ loop: atomic_inc(&cur_trans->num_writers); extwriter_counter_inc(cur_trans, type); spin_unlock(&fs_info->trans_lock); + btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers); + btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters); return 0; } spin_unlock(&fs_info->trans_lock); @@ -333,16 +295,23 @@ loop: if (!cur_trans) return -ENOMEM; + btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers); + btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters); + spin_lock(&fs_info->trans_lock); if (fs_info->running_transaction) { /* * someone started a transaction after we unlocked. Make sure * to redo the checks above */ + btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); + btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); kfree(cur_trans); goto loop; } else if (BTRFS_FS_ERROR(fs_info)) { spin_unlock(&fs_info->trans_lock); + btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); + btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); kfree(cur_trans); return -EROFS; } @@ -396,7 +365,7 @@ loop: spin_lock_init(&cur_trans->releasing_ebs_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(fs_info, &cur_trans->dirty_pages, - IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode); + IO_TREE_TRANS_DIRTY_PAGES, NULL); extent_io_tree_init(fs_info, &cur_trans->pinned_extents, IO_TREE_FS_PINNED_EXTENTS, NULL); fs_info->generation++; @@ -540,6 +509,7 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info) refcount_inc(&cur_trans->use_count); spin_unlock(&fs_info->trans_lock); + btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); wait_event(fs_info->transaction_wait, cur_trans->state >= TRANS_STATE_UNBLOCKED || TRANS_ABORTED(cur_trans)); @@ -624,7 +594,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, */ num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items); if (flush == BTRFS_RESERVE_FLUSH_ALL && - delayed_refs_rsv->full == 0) { + btrfs_block_rsv_full(delayed_refs_rsv) == 0) { delayed_refs_bytes = num_bytes; num_bytes <<= 1; } @@ -649,7 +619,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, if (rsv->space_info->force_alloc) do_chunk_alloc = true; } else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL && - !delayed_refs_rsv->full) { + !btrfs_block_rsv_full(delayed_refs_rsv)) { /* * Some people call with btrfs_start_transaction(root, 0) * because they can be throttled, but have some other mechanism @@ -854,7 +824,46 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root) static noinline void wait_for_commit(struct btrfs_transaction *commit, const enum btrfs_trans_state min_state) { - wait_event(commit->commit_wait, commit->state >= min_state); + struct btrfs_fs_info *fs_info = commit->fs_info; + u64 transid = commit->transid; + bool put = false; + + /* + * At the moment this function is called with min_state either being + * TRANS_STATE_COMPLETED or TRANS_STATE_SUPER_COMMITTED. + */ + if (min_state == TRANS_STATE_COMPLETED) + btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED); + else + btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); + + while (1) { + wait_event(commit->commit_wait, commit->state >= min_state); + if (put) + btrfs_put_transaction(commit); + + if (min_state < TRANS_STATE_COMPLETED) + break; + + /* + * A transaction isn't really completed until all of the + * previous transactions are completed, but with fsync we can + * end up with SUPER_COMMITTED transactions before a COMPLETED + * transaction. Wait for those. + */ + + spin_lock(&fs_info->trans_lock); + commit = list_first_entry_or_null(&fs_info->trans_list, + struct btrfs_transaction, + list); + if (!commit || commit->transid > transid) { + spin_unlock(&fs_info->trans_lock); + break; + } + refcount_inc(&commit->use_count); + put = true; + spin_unlock(&fs_info->trans_lock); + } } int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) @@ -991,6 +1000,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, extwriter_counter_dec(cur_trans, trans->type); cond_wake_up(&cur_trans->writer_wait); + + btrfs_lockdep_release(info, btrfs_trans_num_extwriters); + btrfs_lockdep_release(info, btrfs_trans_num_writers); + btrfs_put_transaction(cur_trans); if (current->journal_info == trans) @@ -1103,7 +1116,7 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, * it's safe to do it (through extent_io_tree_release()). */ err = clear_extent_bit(dirty_pages, start, end, - EXTENT_NEED_WAIT, 0, 0, &cached_state); + EXTENT_NEED_WAIT, &cached_state); if (err == -ENOMEM) err = 0; if (!err) @@ -1320,6 +1333,32 @@ again: } /* + * If we had a pending drop we need to see if there are any others left in our + * dead roots list, and if not clear our bit and wake any waiters. + */ +void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info) +{ + /* + * We put the drop in progress roots at the front of the list, so if the + * first entry doesn't have UNFINISHED_DROP set we can wake everybody + * up. + */ + spin_lock(&fs_info->trans_lock); + if (!list_empty(&fs_info->dead_roots)) { + struct btrfs_root *root = list_first_entry(&fs_info->dead_roots, + struct btrfs_root, + root_list); + if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) { + spin_unlock(&fs_info->trans_lock); + return; + } + } + spin_unlock(&fs_info->trans_lock); + + btrfs_wake_unfinished_drop(fs_info); +} + +/* * dead roots are old snapshots that need to be deleted. This allocates * a dirty root struct and adds it into the list of dead roots that need to * be deleted @@ -1331,7 +1370,12 @@ void btrfs_add_dead_root(struct btrfs_root *root) spin_lock(&fs_info->trans_lock); if (list_empty(&root->root_list)) { btrfs_grab_root(root); - list_add_tail(&root->root_list, &fs_info->dead_roots); + + /* We want to process the partially complete drops first. */ + if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) + list_add(&root->root_list, &fs_info->dead_roots); + else + list_add_tail(&root->root_list, &fs_info->dead_roots); } spin_unlock(&fs_info->trans_lock); } @@ -1770,8 +1814,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + dentry->d_name.len * 2); - parent_inode->i_mtime = parent_inode->i_ctime = - current_time(parent_inode); + parent_inode->i_mtime = current_time(parent_inode); + parent_inode->i_ctime = parent_inode->i_mtime; ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode)); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1897,6 +1941,7 @@ void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) * Wait for the current transaction commit to start and block * subsequent transaction joins */ + btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); wait_event(fs_info->transaction_blocked_wait, cur_trans->state >= TRANS_STATE_COMMIT_START || TRANS_ABORTED(cur_trans)); @@ -1924,6 +1969,12 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) if (cur_trans == fs_info->running_transaction) { cur_trans->state = TRANS_STATE_COMMIT_DOING; spin_unlock(&fs_info->trans_lock); + + /* + * The thread has already released the lockdep map as reader + * already in btrfs_commit_transaction(). + */ + btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers); wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); @@ -1981,16 +2032,24 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { /* - * We use writeback_inodes_sb here because if we used + * We use try_to_writeback_inodes_sb() here because if we used * btrfs_start_delalloc_roots we would deadlock with fs freeze. * Currently are holding the fs freeze lock, if we do an async flush * we'll do btrfs_join_transaction() and deadlock because we need to * wait for the fs freeze lock. Using the direct flushing we benefit * from already being in a transaction and our join_transaction doesn't * have to re-take the fs freeze lock. + * + * Note that try_to_writeback_inodes_sb() will only trigger writeback + * if it can read lock sb->s_umount. It will always be able to lock it, + * except when the filesystem is being unmounted or being frozen, but in + * those cases sync_filesystem() is called, which results in calling + * writeback_inodes_sb() while holding a write lock on sb->s_umount. + * Note that we don't call writeback_inodes_sb() directly, because it + * will emit a warning if sb->s_umount is not locked. */ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) - writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC); + try_to_writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC); return 0; } @@ -2000,20 +2059,52 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } +/* + * Add a pending snapshot associated with the given transaction handle to the + * respective handle. This must be called after the transaction commit started + * and while holding fs_info->trans_lock. + * This serves to guarantee a caller of btrfs_commit_transaction() that it can + * safely free the pending snapshot pointer in case btrfs_commit_transaction() + * returns an error. + */ +static void add_pending_snapshot(struct btrfs_trans_handle *trans) +{ + struct btrfs_transaction *cur_trans = trans->transaction; + + if (!trans->pending_snapshot) + return; + + lockdep_assert_held(&trans->fs_info->trans_lock); + ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_START); + + list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots); +} + +static void update_commit_stats(struct btrfs_fs_info *fs_info, ktime_t interval) +{ + fs_info->commit_stats.commit_count++; + fs_info->commit_stats.last_commit_dur = interval; + fs_info->commit_stats.max_commit_dur = + max_t(u64, fs_info->commit_stats.max_commit_dur, interval); + fs_info->commit_stats.total_commit_dur += interval; +} + int btrfs_commit_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_transaction *prev_trans = NULL; int ret; + ktime_t start_time; + ktime_t interval; ASSERT(refcount_read(&trans->use_count) == 1); + btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); /* Stop the commit early if ->aborted is set */ if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; - btrfs_end_transaction(trans); - return ret; + goto lockdep_trans_commit_start_release; } btrfs_trans_release_metadata(trans); @@ -2030,10 +2121,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * Any running threads may add more while we are here. */ ret = btrfs_run_delayed_refs(trans, 0); - if (ret) { - btrfs_end_transaction(trans); - return ret; - } + if (ret) + goto lockdep_trans_commit_start_release; } btrfs_create_pending_block_groups(trans); @@ -2062,10 +2151,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (run_it) { ret = btrfs_start_dirty_block_groups(trans); - if (ret) { - btrfs_end_transaction(trans); - return ret; - } + if (ret) + goto lockdep_trans_commit_start_release; } } @@ -2073,11 +2160,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (cur_trans->state >= TRANS_STATE_COMMIT_START) { enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; + add_pending_snapshot(trans); + spin_unlock(&fs_info->trans_lock); refcount_inc(&cur_trans->use_count); if (trans->in_fsync) want_state = TRANS_STATE_SUPER_COMMITTED; + + btrfs_trans_state_lockdep_release(fs_info, + BTRFS_LOCKDEP_TRANS_COMMIT_START); ret = btrfs_end_transaction(trans); wait_for_commit(cur_trans, want_state); @@ -2091,6 +2183,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) cur_trans->state = TRANS_STATE_COMMIT_START; wake_up(&fs_info->transaction_blocked_wait); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); if (cur_trans->list.prev != &fs_info->trans_list) { enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; @@ -2110,7 +2203,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_put_transaction(prev_trans); if (ret) - goto cleanup_transaction; + goto lockdep_release; } else { spin_unlock(&fs_info->trans_lock); } @@ -2124,27 +2217,42 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) */ if (BTRFS_FS_ERROR(fs_info)) { ret = -EROFS; - goto cleanup_transaction; + goto lockdep_release; } } + /* + * Get the time spent on the work done by the commit thread and not + * the time spent waiting on a previous commit + */ + start_time = ktime_get_ns(); + extwriter_counter_dec(cur_trans, trans->type); ret = btrfs_start_delalloc_flush(fs_info); if (ret) - goto cleanup_transaction; + goto lockdep_release; ret = btrfs_run_delayed_items(trans); if (ret) - goto cleanup_transaction; + goto lockdep_release; + /* + * The thread has started/joined the transaction thus it holds the + * lockdep map as a reader. It has to release it before acquiring the + * lockdep map as a writer. + */ + btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); + btrfs_might_wait_for_event(fs_info, btrfs_trans_num_extwriters); wait_event(cur_trans->writer_wait, extwriter_counter_read(cur_trans) == 0); /* some pending stuffs might be added after the previous flush. */ ret = btrfs_run_delayed_items(trans); - if (ret) + if (ret) { + btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); goto cleanup_transaction; + } btrfs_wait_delalloc_flush(fs_info); @@ -2153,6 +2261,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * transaction. Otherwise if this transaction commits before the ordered * extents complete we lose logged data after a power failure. */ + btrfs_might_wait_for_event(fs_info, btrfs_trans_pending_ordered); wait_event(cur_trans->pending_wait, atomic_read(&cur_trans->pending_ordered) == 0); @@ -2163,12 +2272,31 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * COMMIT_DOING so make sure to wait for num_writers to == 1 again. */ spin_lock(&fs_info->trans_lock); + add_pending_snapshot(trans); cur_trans->state = TRANS_STATE_COMMIT_DOING; spin_unlock(&fs_info->trans_lock); + + /* + * The thread has started/joined the transaction thus it holds the + * lockdep map as a reader. It has to release it before acquiring the + * lockdep map as a writer. + */ + btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); + btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers); wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); /* + * Make lockdep happy by acquiring the state locks after + * btrfs_trans_num_writers is released. If we acquired the state locks + * before releasing the btrfs_trans_num_writers lock then lockdep would + * complain because we did not follow the reverse order unlocking rule. + */ + btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED); + btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); + btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); + + /* * We've started the commit, clear the flag in case we were triggered to * do an async commit but somebody else started before the transaction * kthread could do the work. @@ -2177,6 +2305,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); goto scrub_continue; } /* @@ -2269,6 +2398,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) list_add_tail(&fs_info->chunk_root->dirty_list, &cur_trans->switch_commits); + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_set_root_node(&fs_info->block_group_root->root_item, + fs_info->block_group_root->node); + list_add_tail(&fs_info->block_group_root->dirty_list, + &cur_trans->switch_commits); + } + switch_commit_roots(trans); ASSERT(list_empty(&cur_trans->dirty_bgs)); @@ -2304,6 +2440,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) mutex_unlock(&fs_info->reloc_mutex); wake_up(&fs_info->transaction_wait); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); ret = btrfs_write_and_wait_transaction(trans); if (ret) { @@ -2335,6 +2472,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) */ cur_trans->state = TRANS_STATE_SUPER_COMMITTED; wake_up(&cur_trans->commit_wait); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); btrfs_finish_extent_commit(trans); @@ -2348,6 +2486,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) */ cur_trans->state = TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED); spin_lock(&fs_info->trans_lock); list_del_init(&cur_trans->list); @@ -2361,6 +2500,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) trace_btrfs_transaction_commit(fs_info); + interval = ktime_get_ns() - start_time; + btrfs_scrub_continue(fs_info); if (current->journal_info == trans) @@ -2368,11 +2509,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) kmem_cache_free(btrfs_trans_handle_cachep, trans); + update_commit_stats(fs_info, interval); + return ret; unlock_reloc: mutex_unlock(&fs_info->reloc_mutex); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); scrub_continue: + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED); btrfs_scrub_continue(fs_info); cleanup_transaction: btrfs_trans_release_metadata(trans); @@ -2385,6 +2531,16 @@ cleanup_transaction: cleanup_transaction(trans, ret); return ret; + +lockdep_release: + btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); + btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); + goto cleanup_transaction; + +lockdep_trans_commit_start_release: + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_end_transaction(trans); + return ret; } /* @@ -2397,10 +2553,10 @@ cleanup_transaction: * because btrfs_commit_super will poke cleaner thread and it will process it a * few seconds later. */ -int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) +int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) { + struct btrfs_root *root; int ret; - struct btrfs_fs_info *fs_info = root->fs_info; spin_lock(&fs_info->trans_lock); if (list_empty(&fs_info->dead_roots)) { diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 1852ed9de7fd..970ff316069d 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -123,6 +123,8 @@ struct btrfs_trans_handle { struct btrfs_transaction *transaction; struct btrfs_block_rsv *block_rsv; struct btrfs_block_rsv *orig_rsv; + /* Set by a task that wants to create a snapshot. */ + struct btrfs_pending_snapshot *pending_snapshot; refcount_t use_count; unsigned int type; /* @@ -214,7 +216,8 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid); void btrfs_add_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root); -int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); +void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info); +int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info); int btrfs_commit_transaction(struct btrfs_trans_handle *trans); void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans); int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 72e1c942197d..43f905ab0a18 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -639,8 +639,10 @@ static void block_group_err(const struct extent_buffer *eb, int slot, static int check_block_group_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { + struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_block_group_item bgi; u32 item_size = btrfs_item_size(leaf, slot); + u64 chunk_objectid; u64 flags; u64 type; @@ -663,8 +665,23 @@ static int check_block_group_item(struct extent_buffer *leaf, read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), sizeof(bgi)); - if (unlikely(btrfs_stack_block_group_chunk_objectid(&bgi) != - BTRFS_FIRST_CHUNK_TREE_OBJECTID)) { + chunk_objectid = btrfs_stack_block_group_chunk_objectid(&bgi); + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + /* + * We don't init the nr_global_roots until we load the global + * roots, so this could be 0 at mount time. If it's 0 we'll + * just assume we're fine, and later we'll check against our + * actual value. + */ + if (unlikely(fs_info->nr_global_roots && + chunk_objectid >= fs_info->nr_global_roots)) { + block_group_err(leaf, slot, + "invalid block group global root id, have %llu, needs to be <= %llu", + chunk_objectid, + fs_info->nr_global_roots); + return -EUCLEAN; + } + } else if (unlikely(chunk_objectid != BTRFS_FIRST_CHUNK_TREE_OBJECTID)) { block_group_err(leaf, slot, "invalid block group chunk objectid, have %llu expect %llu", btrfs_stack_block_group_chunk_objectid(&bgi), @@ -965,6 +982,7 @@ static int check_dev_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_dev_item *ditem; + const u32 item_size = btrfs_item_size(leaf, slot); if (unlikely(key->objectid != BTRFS_DEV_ITEMS_OBJECTID)) { dev_item_err(leaf, slot, @@ -972,6 +990,13 @@ static int check_dev_item(struct extent_buffer *leaf, key->objectid, BTRFS_DEV_ITEMS_OBJECTID); return -EUCLEAN; } + + if (unlikely(item_size != sizeof(*ditem))) { + dev_item_err(leaf, slot, "invalid item size: has %u expect %zu", + item_size, sizeof(*ditem)); + return -EUCLEAN; + } + ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); if (unlikely(btrfs_device_id(leaf, ditem) != key->offset)) { dev_item_err(leaf, slot, @@ -1007,6 +1032,7 @@ static int check_inode_item(struct extent_buffer *leaf, struct btrfs_inode_item *iitem; u64 super_gen = btrfs_super_generation(fs_info->super_copy); u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777); + const u32 item_size = btrfs_item_size(leaf, slot); u32 mode; int ret; u32 flags; @@ -1016,6 +1042,12 @@ static int check_inode_item(struct extent_buffer *leaf, if (unlikely(ret < 0)) return ret; + if (unlikely(item_size != sizeof(*iitem))) { + generic_err(leaf, slot, "invalid item size: has %u expect %zu", + item_size, sizeof(*iitem)); + return -EUCLEAN; + } + iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item); /* Here we use super block generation + 1 to handle log tree */ @@ -1201,7 +1233,8 @@ static void extent_err(const struct extent_buffer *eb, int slot, } static int check_extent_item(struct extent_buffer *leaf, - struct btrfs_key *key, int slot) + struct btrfs_key *key, int slot, + struct btrfs_key *prev_key) { struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_extent_item *ei; @@ -1421,6 +1454,26 @@ static int check_extent_item(struct extent_buffer *leaf, total_refs, inline_refs); return -EUCLEAN; } + + if ((prev_key->type == BTRFS_EXTENT_ITEM_KEY) || + (prev_key->type == BTRFS_METADATA_ITEM_KEY)) { + u64 prev_end = prev_key->objectid; + + if (prev_key->type == BTRFS_METADATA_ITEM_KEY) + prev_end += fs_info->nodesize; + else + prev_end += prev_key->offset; + + if (unlikely(prev_end > key->objectid)) { + extent_err(leaf, slot, + "previous extent [%llu %u %llu] overlaps current extent [%llu %u %llu]", + prev_key->objectid, prev_key->type, + prev_key->offset, key->objectid, key->type, + key->offset); + return -EUCLEAN; + } + } + return 0; } @@ -1589,7 +1642,7 @@ static int check_leaf_item(struct extent_buffer *leaf, break; case BTRFS_EXTENT_ITEM_KEY: case BTRFS_METADATA_ITEM_KEY: - ret = check_extent_item(leaf, key, slot); + ret = check_extent_item(leaf, key, slot, prev_key); break; case BTRFS_TREE_BLOCK_REF_KEY: case BTRFS_SHARED_DATA_REF_KEY: @@ -1633,7 +1686,6 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) /* These trees must never be empty */ if (unlikely(owner == BTRFS_ROOT_TREE_OBJECTID || owner == BTRFS_CHUNK_TREE_OBJECTID || - owner == BTRFS_EXTENT_TREE_OBJECTID || owner == BTRFS_DEV_TREE_OBJECTID || owner == BTRFS_FS_TREE_OBJECTID || owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) { @@ -1642,12 +1694,25 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) owner); return -EUCLEAN; } + /* Unknown tree */ if (unlikely(owner == 0)) { generic_err(leaf, 0, "invalid owner, root 0 is not defined"); return -EUCLEAN; } + + /* EXTENT_TREE_V2 can have empty extent trees. */ + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return 0; + + if (unlikely(owner == BTRFS_EXTENT_TREE_OBJECTID)) { + generic_err(leaf, 0, + "invalid root, root %llu must never be empty", + owner); + return -EUCLEAN; + } + return 0; } @@ -1667,6 +1732,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) */ for (slot = 0; slot < nritems; slot++) { u32 item_end_expected; + u64 item_data_end; int ret; btrfs_item_key_to_cpu(leaf, &key, slot); @@ -1681,6 +1747,8 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) return -EUCLEAN; } + item_data_end = (u64)btrfs_item_offset(leaf, slot) + + btrfs_item_size(leaf, slot); /* * Make sure the offset and ends are right, remember that the * item data starts at the end of the leaf and grows towards the @@ -1691,11 +1759,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) else item_end_expected = btrfs_item_offset(leaf, slot - 1); - if (unlikely(btrfs_item_data_end(leaf, slot) != item_end_expected)) { + if (unlikely(item_data_end != item_end_expected)) { generic_err(leaf, slot, - "unexpected item end, have %u expect %u", - btrfs_item_data_end(leaf, slot), - item_end_expected); + "unexpected item end, have %llu expect %u", + item_data_end, item_end_expected); return -EUCLEAN; } @@ -1704,12 +1771,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) * just in case all the items are consistent to each other, but * all point outside of the leaf. */ - if (unlikely(btrfs_item_data_end(leaf, slot) > - BTRFS_LEAF_DATA_SIZE(fs_info))) { + if (unlikely(item_data_end > BTRFS_LEAF_DATA_SIZE(fs_info))) { generic_err(leaf, slot, - "slot end outside of leaf, have %u expect range [0, %u]", - btrfs_item_data_end(leaf, slot), - BTRFS_LEAF_DATA_SIZE(fs_info)); + "slot end outside of leaf, have %llu expect range [0, %u]", + item_data_end, BTRFS_LEAF_DATA_SIZE(fs_info)); return -EUCLEAN; } @@ -1811,3 +1876,58 @@ out: return ret; } ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO); + +int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) +{ + const bool is_subvol = is_fstree(root_owner); + const u64 eb_owner = btrfs_header_owner(eb); + + /* + * Skip dummy fs, as selftests don't create unique ebs for each dummy + * root. + */ + if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &eb->fs_info->fs_state)) + return 0; + /* + * There are several call sites (backref walking, qgroup, and data + * reloc) passing 0 as @root_owner, as they are not holding the + * tree root. In that case, we can not do a reliable ownership check, + * so just exit. + */ + if (root_owner == 0) + return 0; + /* + * These trees use key.offset as their owner, our callers don't have + * the extra capacity to pass key.offset here. So we just skip them. + */ + if (root_owner == BTRFS_TREE_LOG_OBJECTID || + root_owner == BTRFS_TREE_RELOC_OBJECTID) + return 0; + + if (!is_subvol) { + /* For non-subvolume trees, the eb owner should match root owner */ + if (unlikely(root_owner != eb_owner)) { + btrfs_crit(eb->fs_info, +"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect %llu", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + root_owner, btrfs_header_bytenr(eb), eb_owner, + root_owner); + return -EUCLEAN; + } + return 0; + } + + /* + * For subvolume trees, owners can mismatch, but they should all belong + * to subvolume trees. + */ + if (unlikely(is_subvol != is_fstree(eb_owner))) { + btrfs_crit(eb->fs_info, +"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect [%llu, %llu]", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + root_owner, btrfs_header_bytenr(eb), eb_owner, + BTRFS_FIRST_FREE_OBJECTID, BTRFS_LAST_FREE_OBJECTID); + return -EUCLEAN; + } + return 0; +} diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 32fecc9dc1dd..ece497e26558 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -25,5 +25,6 @@ int btrfs_check_node(struct extent_buffer *node); int btrfs_check_chunk_valid(struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 logical); +int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c1ddbe800897..c3cf3dabe0b1 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -22,6 +22,8 @@ #include "zoned.h" #include "inode-item.h" +#define MAX_CONFLICT_INODES 10 + /* magic values for the inode_only field in btrfs_log_inode: * * LOG_INODE_ALL means to log everything @@ -31,8 +33,6 @@ enum { LOG_INODE_ALL, LOG_INODE_EXISTS, - LOG_OTHER_INODE, - LOG_OTHER_INODE_ALL, }; /* @@ -171,7 +171,7 @@ again: int index = (root->log_transid + 1) % 2; if (btrfs_need_log_full_commit(trans)) { - ret = -EAGAIN; + ret = BTRFS_LOG_FORCE_COMMIT; goto out; } @@ -194,7 +194,7 @@ again: * writing. */ if (zoned && !created) { - ret = -EAGAIN; + ret = BTRFS_LOG_FORCE_COMMIT; goto out; } @@ -270,12 +270,6 @@ void btrfs_end_log_trans(struct btrfs_root *root) } } -static int btrfs_write_tree_block(struct extent_buffer *buf) -{ - return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start, - buf->start + buf->len - 1); -} - static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) { filemap_fdatawait_range(buf->pages[0]->mapping, @@ -294,16 +288,6 @@ struct walk_control { */ int free; - /* should we write out the extent buffer? This is used - * while flushing the log tree to disk during a sync - */ - int write; - - /* should we wait for the extent buffer io to finish? Also used - * while flushing the log tree to disk for a sync - */ - int wait; - /* pin only walk, we record which extents on disk belong to the * log trees */ @@ -349,22 +333,20 @@ static int process_one_buffer(struct btrfs_root *log, * pin down any logged extents, so we have to read the block. */ if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { - ret = btrfs_read_buffer(eb, gen, level, NULL); + ret = btrfs_read_extent_buffer(eb, gen, level, NULL); if (ret) return ret; } - if (wc->pin) + if (wc->pin) { ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start, eb->len); + if (ret) + return ret; - if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { - if (wc->pin && btrfs_header_level(eb) == 0) + if (btrfs_buffer_uptodate(eb, gen, 0) && + btrfs_header_level(eb) == 0) ret = btrfs_exclude_logged_extents(eb); - if (wc->write) - btrfs_write_tree_block(eb); - if (wc->wait) - btrfs_wait_tree_block_writeback(eb); } return ret; } @@ -819,7 +801,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_lookup_csums_range(root->log_root, csum_start, csum_end - 1, - &ordered_sums, 0); + &ordered_sums, 0, false); if (ret) goto out; /* @@ -912,11 +894,30 @@ update_inode: btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); out: - if (inode) - iput(inode); + iput(inode); return ret; } +static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, + struct btrfs_inode *dir, + struct btrfs_inode *inode, + const char *name, + int name_len) +{ + int ret; + + ret = btrfs_unlink_inode(trans, dir, inode, name, name_len); + if (ret) + return ret; + /* + * Whenever we need to check if a name exists or not, we check the + * fs/subvolume tree. So after an unlink we must run delayed items, so + * that future checks for a name during log replay see that the name + * does not exists anymore. + */ + return btrfs_run_delayed_items(trans); +} + /* * when cleaning up conflicts between the directory names in the * subvolume, directory names in the log and directory names in the @@ -959,12 +960,8 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = btrfs_unlink_inode(trans, dir, BTRFS_I(inode), name, + ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), name, name_len); - if (ret) - goto out; - else - ret = btrfs_run_delayed_items(trans); out: kfree(name); iput(inode); @@ -1066,8 +1063,7 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, u64 inode_objectid, u64 parent_objectid, - u64 ref_index, char *name, int namelen, - int *search_done) + u64 ref_index, char *name, int namelen) { int ret; char *victim_name; @@ -1124,27 +1120,17 @@ again: inc_nlink(&inode->vfs_inode); btrfs_release_path(path); - ret = btrfs_unlink_inode(trans, dir, inode, + ret = unlink_inode_for_log_replay(trans, dir, inode, victim_name, victim_name_len); kfree(victim_name); if (ret) return ret; - ret = btrfs_run_delayed_items(trans); - if (ret) - return ret; - *search_done = 1; goto again; } kfree(victim_name); ptr = (unsigned long)(victim_ref + 1) + victim_name_len; } - - /* - * NOTE: we have searched root tree and checked the - * corresponding ref, it does not need to check again. - */ - *search_done = 1; } btrfs_release_path(path); @@ -1152,7 +1138,9 @@ again: extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen, inode_objectid, parent_objectid, 0, 0); - if (!IS_ERR_OR_NULL(extref)) { + if (IS_ERR(extref)) { + return PTR_ERR(extref); + } else if (extref) { u32 item_size; u32 cur_offset = 0; unsigned long base; @@ -1196,27 +1184,22 @@ again: inc_nlink(&inode->vfs_inode); btrfs_release_path(path); - ret = btrfs_unlink_inode(trans, + ret = unlink_inode_for_log_replay(trans, BTRFS_I(victim_parent), inode, victim_name, victim_name_len); - if (!ret) - ret = btrfs_run_delayed_items( - trans); } iput(victim_parent); kfree(victim_name); if (ret) return ret; - *search_done = 1; goto again; } kfree(victim_name); next: cur_offset += victim_name_len + sizeof(*extref); } - *search_done = 1; } btrfs_release_path(path); @@ -1358,7 +1341,7 @@ again: kfree(name); goto out; } - ret = btrfs_unlink_inode(trans, BTRFS_I(dir), + ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), inode, name, namelen); kfree(name); iput(dir); @@ -1380,107 +1363,6 @@ again: return ret; } -static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir, - const u8 ref_type, const char *name, - const int namelen) -{ - struct btrfs_key key; - struct btrfs_path *path; - const u64 parent_id = btrfs_ino(BTRFS_I(dir)); - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = btrfs_ino(BTRFS_I(inode)); - key.type = ref_type; - if (key.type == BTRFS_INODE_REF_KEY) - key.offset = parent_id; - else - key.offset = btrfs_extref_hash(parent_id, name, namelen); - - ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0); - if (ret < 0) - goto out; - if (ret > 0) { - ret = 0; - goto out; - } - if (key.type == BTRFS_INODE_EXTREF_KEY) - ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], - path->slots[0], parent_id, name, namelen); - else - ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0], - name, namelen); - -out: - btrfs_free_path(path); - return ret; -} - -static int add_link(struct btrfs_trans_handle *trans, - struct inode *dir, struct inode *inode, const char *name, - int namelen, u64 ref_index) -{ - struct btrfs_root *root = BTRFS_I(dir)->root; - struct btrfs_dir_item *dir_item; - struct btrfs_key key; - struct btrfs_path *path; - struct inode *other_inode = NULL; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - dir_item = btrfs_lookup_dir_item(NULL, root, path, - btrfs_ino(BTRFS_I(dir)), - name, namelen, 0); - if (!dir_item) { - btrfs_release_path(path); - goto add_link; - } else if (IS_ERR(dir_item)) { - ret = PTR_ERR(dir_item); - goto out; - } - - /* - * Our inode's dentry collides with the dentry of another inode which is - * in the log but not yet processed since it has a higher inode number. - * So delete that other dentry. - */ - btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key); - btrfs_release_path(path); - other_inode = read_one_inode(root, key.objectid); - if (!other_inode) { - ret = -ENOENT; - goto out; - } - ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(other_inode), - name, namelen); - if (ret) - goto out; - /* - * If we dropped the link count to 0, bump it so that later the iput() - * on the inode will not free it. We will fixup the link count later. - */ - if (other_inode->i_nlink == 0) - inc_nlink(other_inode); - - ret = btrfs_run_delayed_items(trans); - if (ret) - goto out; -add_link: - ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - name, namelen, 0, ref_index); -out: - iput(other_inode); - btrfs_free_path(path); - - return ret; -} - /* * replay one inode back reference item found in the log tree. * eb, slot and key refer to the buffer and key found in the log tree. @@ -1501,7 +1383,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, char *name = NULL; int namelen; int ret; - int search_done = 0; int log_ref_ver = 0; u64 parent_objectid; u64 inode_objectid; @@ -1576,51 +1457,19 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * overwrite any existing back reference, and we don't * want to create dangling pointers in the directory. */ - - if (!search_done) { - ret = __add_inode_ref(trans, root, path, log, - BTRFS_I(dir), - BTRFS_I(inode), - inode_objectid, - parent_objectid, - ref_index, name, namelen, - &search_done); - if (ret) { - if (ret == 1) - ret = 0; - goto out; - } - } - - /* - * If a reference item already exists for this inode - * with the same parent and name, but different index, - * drop it and the corresponding directory index entries - * from the parent before adding the new reference item - * and dir index entries, otherwise we would fail with - * -EEXIST returned from btrfs_add_link() below. - */ - ret = btrfs_inode_ref_exists(inode, dir, key->type, - name, namelen); - if (ret > 0) { - ret = btrfs_unlink_inode(trans, - BTRFS_I(dir), - BTRFS_I(inode), - name, namelen); - /* - * If we dropped the link count to 0, bump it so - * that later the iput() on the inode will not - * free it. We will fixup the link count later. - */ - if (!ret && inode->i_nlink == 0) - inc_nlink(inode); - } - if (ret < 0) + ret = __add_inode_ref(trans, root, path, log, + BTRFS_I(dir), BTRFS_I(inode), + inode_objectid, parent_objectid, + ref_index, name, namelen); + if (ret) { + if (ret == 1) + ret = 0; goto out; + } /* insert our name */ - ret = add_link(trans, dir, inode, name, namelen, - ref_index); + ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), + name, namelen, 0, ref_index); if (ret) goto out; @@ -2300,7 +2149,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct btrfs_key location; /* - * Currenly we only log dir index keys. Even if we replay a log created + * Currently we only log dir index keys. Even if we replay a log created * by an older kernel that logged both dir index and dir item keys, all * we need to do is process the dir index keys, we (and our caller) can * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY). @@ -2350,15 +2199,8 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, goto out; inc_nlink(inode); - ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(inode), name, - name_len); - if (ret) - goto out; - - ret = btrfs_run_delayed_items(trans); - if (ret) - goto out; - + ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), + name, name_len); /* * Unlike dir item keys, dir index keys can only have one name (entry) in * them, as there are no key collisions since each key has a unique offset @@ -2594,7 +2436,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, int i; int ret; - ret = btrfs_read_buffer(eb, gen, level, NULL); + ret = btrfs_read_extent_buffer(eb, gen, level, NULL); if (ret) return ret; @@ -2805,7 +2647,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, path->slots[*level]++; if (wc->free) { - ret = btrfs_read_buffer(next, ptr_gen, + ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key); if (ret) { free_extent_buffer(next); @@ -2834,7 +2676,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, free_extent_buffer(next); continue; } - ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key); + ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key); if (ret) { free_extent_buffer(next); return ret; @@ -3141,7 +2983,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* bail out if we need to do a full commit */ if (btrfs_need_log_full_commit(trans)) { - ret = -EAGAIN; + ret = BTRFS_LOG_FORCE_COMMIT; mutex_unlock(&root->log_mutex); goto out; } @@ -3207,6 +3049,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_alloc_log_tree_node(trans, log_root_tree); if (ret) { mutex_unlock(&fs_info->tree_root->log_mutex); + blk_finish_plug(&plug); goto out; } } @@ -3241,7 +3084,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } btrfs_wait_tree_log_extents(log, mark); mutex_unlock(&log_root_tree->log_mutex); - ret = -EAGAIN; + ret = BTRFS_LOG_FORCE_COMMIT; goto out; } @@ -3280,7 +3123,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, blk_finish_plug(&plug); btrfs_wait_tree_log_extents(log, mark); mutex_unlock(&log_root_tree->log_mutex); - ret = -EAGAIN; + ret = BTRFS_LOG_FORCE_COMMIT; goto out_wake_log_root; } @@ -3414,6 +3257,29 @@ static void free_log_tree(struct btrfs_trans_handle *trans, if (log->node) { ret = walk_log_tree(trans, log, &wc); if (ret) { + /* + * We weren't able to traverse the entire log tree, the + * typical scenario is getting an -EIO when reading an + * extent buffer of the tree, due to a previous writeback + * failure of it. + */ + set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, + &log->fs_info->fs_state); + + /* + * Some extent buffers of the log tree may still be dirty + * and not yet written back to storage, because we may + * have updates to a log tree without syncing a log tree, + * such as during rename and link operations. So flush + * them out and wait for their writeback to complete, so + * that we properly cleanup their state and pages. + */ + btrfs_write_marked_extents(log->fs_info, + &log->dirty_log_pages, + EXTENT_DIRTY | EXTENT_NEW); + btrfs_wait_tree_log_extents(log, + EXTENT_DIRTY | EXTENT_NEW); + if (trans) btrfs_abort_transaction(trans, ret); else @@ -3454,35 +3320,156 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, } /* - * Check if an inode was logged in the current transaction. This may often - * return some false positives, because logged_trans is an in memory only field, - * not persisted anywhere. This is meant to be used in contexts where a false - * positive has no functional consequences. + * Check if an inode was logged in the current transaction. This correctly deals + * with the case where the inode was logged but has a logged_trans of 0, which + * happens if the inode is evicted and loaded again, as logged_trans is an in + * memory only field (not persisted). + * + * Returns 1 if the inode was logged before in the transaction, 0 if it was not, + * and < 0 on error. */ -static bool inode_logged(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode) +static int inode_logged(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path_in) { + struct btrfs_path *path = path_in; + struct btrfs_key key; + int ret; + if (inode->logged_trans == trans->transid) - return true; + return 1; - if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) - return false; + /* + * If logged_trans is not 0, then we know the inode logged was not logged + * in this transaction, so we can return false right away. + */ + if (inode->logged_trans > 0) + return 0; /* - * The inode's logged_trans is always 0 when we load it (because it is - * not persisted in the inode item or elsewhere). So if it is 0, the - * inode was last modified in the current transaction then the inode may - * have been logged before in the current transaction, then evicted and - * loaded again in the current transaction - or may have never been logged - * in the current transaction, but since we can not be sure, we have to - * assume it was, otherwise our callers can leave an inconsistent log. + * If no log tree was created for this root in this transaction, then + * the inode can not have been logged in this transaction. In that case + * set logged_trans to anything greater than 0 and less than the current + * transaction's ID, to avoid the search below in a future call in case + * a log tree gets created after this. */ - if (inode->logged_trans == 0 && - inode->last_trans == trans->transid && - !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags)) - return true; + if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) { + inode->logged_trans = trans->transid - 1; + return 0; + } - return false; + /* + * We have a log tree and the inode's logged_trans is 0. We can't tell + * for sure if the inode was logged before in this transaction by looking + * only at logged_trans. We could be pessimistic and assume it was, but + * that can lead to unnecessarily logging an inode during rename and link + * operations, and then further updating the log in followup rename and + * link operations, specially if it's a directory, which adds latency + * visible to applications doing a series of rename or link operations. + * + * A logged_trans of 0 here can mean several things: + * + * 1) The inode was never logged since the filesystem was mounted, and may + * or may have not been evicted and loaded again; + * + * 2) The inode was logged in a previous transaction, then evicted and + * then loaded again; + * + * 3) The inode was logged in the current transaction, then evicted and + * then loaded again. + * + * For cases 1) and 2) we don't want to return true, but we need to detect + * case 3) and return true. So we do a search in the log root for the inode + * item. + */ + key.objectid = btrfs_ino(inode); + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + if (!path) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + } + + ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0); + + if (path_in) + btrfs_release_path(path); + else + btrfs_free_path(path); + + /* + * Logging an inode always results in logging its inode item. So if we + * did not find the item we know the inode was not logged for sure. + */ + if (ret < 0) { + return ret; + } else if (ret > 0) { + /* + * Set logged_trans to a value greater than 0 and less then the + * current transaction to avoid doing the search in future calls. + */ + inode->logged_trans = trans->transid - 1; + return 0; + } + + /* + * The inode was previously logged and then evicted, set logged_trans to + * the current transacion's ID, to avoid future tree searches as long as + * the inode is not evicted again. + */ + inode->logged_trans = trans->transid; + + /* + * If it's a directory, then we must set last_dir_index_offset to the + * maximum possible value, so that the next attempt to log the inode does + * not skip checking if dir index keys found in modified subvolume tree + * leaves have been logged before, otherwise it would result in attempts + * to insert duplicate dir index keys in the log tree. This must be done + * because last_dir_index_offset is an in-memory only field, not persisted + * in the inode item or any other on-disk structure, so its value is lost + * once the inode is evicted. + */ + if (S_ISDIR(inode->vfs_inode.i_mode)) + inode->last_dir_index_offset = (u64)-1; + + return 1; +} + +/* + * Delete a directory entry from the log if it exists. + * + * Returns < 0 on error + * 1 if the entry does not exists + * 0 if the entry existed and was successfully deleted + */ +static int del_logged_dentry(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + u64 dir_ino, + const char *name, int name_len, + u64 index) +{ + struct btrfs_dir_item *di; + + /* + * We only log dir index items of a directory, so we don't need to look + * for dir item keys. + */ + di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, + index, name, name_len, -1); + if (IS_ERR(di)) + return PTR_ERR(di); + else if (!di) + return 1; + + /* + * We do not need to update the size field of the directory's + * inode item because on log replay we update the field to reflect + * all existing entries in the directory (see overwrite_item()). + */ + return btrfs_delete_one_dir_name(trans, log, path, di); } /* @@ -3511,15 +3498,16 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, u64 index) { - struct btrfs_root *log; - struct btrfs_dir_item *di; struct btrfs_path *path; int ret; - int err = 0; - u64 dir_ino = btrfs_ino(dir); - if (!inode_logged(trans, dir)) + ret = inode_logged(trans, dir, NULL); + if (ret == 0) + return; + else if (ret < 0) { + btrfs_set_log_full_commit(trans); return; + } ret = join_running_log_trans(root); if (ret) @@ -3527,41 +3515,18 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, mutex_lock(&dir->log_mutex); - log = root->log_root; path = btrfs_alloc_path(); if (!path) { - err = -ENOMEM; + ret = -ENOMEM; goto out_unlock; } - /* - * We only log dir index items of a directory, so we don't need to look - * for dir item keys. - */ - di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, - index, name, name_len, -1); - if (IS_ERR(di)) { - err = PTR_ERR(di); - goto fail; - } - if (di) { - ret = btrfs_delete_one_dir_name(trans, log, path, di); - if (ret) { - err = ret; - goto fail; - } - } - - /* - * We do not need to update the size field of the directory's inode item - * because on log replay we update the field to reflect all existing - * entries in the directory (see overwrite_item()). - */ -fail: + ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir), + name, name_len, index); btrfs_free_path(path); out_unlock: mutex_unlock(&dir->log_mutex); - if (err < 0) + if (ret < 0) btrfs_set_log_full_commit(trans); btrfs_end_log_trans(root); } @@ -3576,8 +3541,13 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, u64 index; int ret; - if (!inode_logged(trans, inode)) + ret = inode_logged(trans, inode, NULL); + if (ret == 0) + return; + else if (ret < 0) { + btrfs_set_log_full_commit(trans); return; + } ret = join_running_log_trans(root); if (ret) @@ -3612,11 +3582,29 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, key.offset = first_offset; key.type = BTRFS_DIR_LOG_INDEX_KEY; ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); - if (ret) + /* + * -EEXIST is fine and can happen sporadically when we are logging a + * directory and have concurrent insertions in the subvolume's tree for + * items from other inodes and that result in pushing off some dir items + * from one leaf to another in order to accommodate for the new items. + * This results in logging the same dir index range key. + */ + if (ret && ret != -EEXIST) return ret; item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_dir_log_item); + if (ret == -EEXIST) { + const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item); + + /* + * btrfs_del_dir_entries_in_log() might have been called during + * an unlink between the initial insertion of this key and the + * current update, or we might be logging a single entry deletion + * during a rename, so set the new last_offset to the max value. + */ + last_offset = max(last_offset, curr_end); + } btrfs_set_dir_log_end(path->nodes[0], item, last_offset); btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_release_path(path); @@ -3702,19 +3690,34 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path, - struct btrfs_log_ctx *ctx) + struct btrfs_log_ctx *ctx, + u64 *last_old_dentry_offset) { struct btrfs_root *log = inode->root->log_root; - struct extent_buffer *src = path->nodes[0]; - const int nritems = btrfs_header_nritems(src); + struct extent_buffer *src; + const int nritems = btrfs_header_nritems(path->nodes[0]); const u64 ino = btrfs_ino(inode); - const bool inode_logged_before = inode_logged(trans, inode); bool last_found = false; int batch_start = 0; int batch_size = 0; int i; - for (i = path->slots[0]; i < nritems; i++) { + /* + * We need to clone the leaf, release the read lock on it, and use the + * clone before modifying the log tree. See the comment at copy_items() + * about why we need to do this. + */ + src = btrfs_clone_extent_buffer(path->nodes[0]); + if (!src) + return -ENOMEM; + + i = path->slots[0]; + btrfs_release_path(path); + path->nodes[0] = src; + path->slots[0] = i; + + for (; i < nritems; i++) { + struct btrfs_dir_item *di; struct btrfs_key key; int ret; @@ -3725,7 +3728,32 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, break; } + di = btrfs_item_ptr(src, i, struct btrfs_dir_item); ctx->last_dir_item_offset = key.offset; + + /* + * Skip ranges of items that consist only of dir item keys created + * in past transactions. However if we find a gap, we must log a + * dir index range item for that gap, so that index keys in that + * gap are deleted during log replay. + */ + if (btrfs_dir_transid(src, di) < trans->transid) { + if (key.offset > *last_old_dentry_offset + 1) { + ret = insert_dir_log_key(trans, log, dst_path, + ino, *last_old_dentry_offset + 1, + key.offset - 1); + if (ret < 0) + return ret; + } + + *last_old_dentry_offset = key.offset; + continue; + } + + /* If we logged this dir index item before, we can skip it. */ + if (key.offset <= inode->last_dir_index_offset) + continue; + /* * We must make sure that when we log a directory entry, the * corresponding inode, after log replay, has a matching link @@ -3749,60 +3777,13 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, * resulting in -ENOTEMPTY errors. */ if (!ctx->log_new_dentries) { - struct btrfs_dir_item *di; struct btrfs_key di_key; - di = btrfs_item_ptr(src, i, struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(src, di, &di_key); - if ((btrfs_dir_transid(src, di) == trans->transid || - btrfs_dir_type(src, di) == BTRFS_FT_DIR) && - di_key.type != BTRFS_ROOT_ITEM_KEY) + if (di_key.type != BTRFS_ROOT_ITEM_KEY) ctx->log_new_dentries = true; } - if (!inode_logged_before) - goto add_to_batch; - - /* - * If we were logged before and have logged dir items, we can skip - * checking if any item with a key offset larger than the last one - * we logged is in the log tree, saving time and avoiding adding - * contention on the log tree. - */ - if (key.offset > inode->last_dir_index_offset) - goto add_to_batch; - /* - * Check if the key was already logged before. If not we can add - * it to a batch for bulk insertion. - */ - ret = btrfs_search_slot(NULL, log, &key, dst_path, 0, 0); - if (ret < 0) { - return ret; - } else if (ret > 0) { - btrfs_release_path(dst_path); - goto add_to_batch; - } - - /* - * Item exists in the log. Overwrite the item in the log if it - * has different content or do nothing if it has exactly the same - * content. And then flush the current batch if any - do it after - * overwriting the current item, or we would deadlock otherwise, - * since we are holding a path for the existing item. - */ - ret = do_overwrite_item(trans, log, dst_path, src, i, &key); - if (ret < 0) - return ret; - - if (batch_size > 0) { - ret = flush_dir_items_batch(trans, log, src, dst_path, - batch_start, batch_size); - if (ret < 0) - return ret; - batch_size = 0; - } - continue; -add_to_batch: if (batch_size == 0) batch_start = i; batch_size++; @@ -3837,7 +3818,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_root *log = root->log_root; int err = 0; int ret; - u64 first_offset = min_offset; + u64 last_old_dentry_offset = min_offset - 1; u64 last_offset = (u64)-1; u64 ino = btrfs_ino(inode); @@ -3871,10 +3852,11 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, */ if (ret == 0) { struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); if (tmp.type == BTRFS_DIR_INDEX_KEY) - first_offset = max(min_offset, tmp.offset) + 1; + last_old_dentry_offset = tmp.offset; } goto done; } @@ -3883,17 +3865,18 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); if (ret == 0) { struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); - if (tmp.type == BTRFS_DIR_INDEX_KEY) { - first_offset = tmp.offset; - ret = overwrite_item(trans, log, dst_path, - path->nodes[0], path->slots[0], - &tmp); - if (ret) { - err = ret; - goto done; - } - } + /* + * The dir index key before the first one we found that needs to + * be logged might be in a previous leaf, and there might be a + * gap between these keys, meaning that we had deletions that + * happened. So the key range item we log (key type + * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the + * previous key's offset plus 1, so that those deletes are replayed. + */ + if (tmp.type == BTRFS_DIR_INDEX_KEY) + last_old_dentry_offset = tmp.offset; } btrfs_release_path(path); @@ -3915,7 +3898,8 @@ search: * from our directory */ while (1) { - ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx); + ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx, + &last_old_dentry_offset); if (ret != 0) { if (ret < 0) err = ret; @@ -3941,14 +3925,16 @@ search: goto done; } if (btrfs_header_generation(path->nodes[0]) != trans->transid) { - ctx->last_dir_item_offset = min_key.offset; - ret = overwrite_item(trans, log, dst_path, - path->nodes[0], path->slots[0], - &min_key); - if (ret) - err = ret; - else - last_offset = min_key.offset; + /* + * The next leaf was not changed in the current transaction + * and has at least one dir index key. + * We check for the next key because there might have been + * one or more deletions between the last key we logged and + * that next key. So the key range item we log (key type + * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's + * offset minus 1, so that those deletes are replayed. + */ + last_offset = min_key.offset - 1; goto done; } if (need_resched()) { @@ -3964,18 +3950,91 @@ done: if (err == 0) { *last_offset_ret = last_offset; /* - * insert the log range keys to indicate where the log - * is valid + * In case the leaf was changed in the current transaction but + * all its dir items are from a past transaction, the last item + * in the leaf is a dir item and there's no gap between that last + * dir item and the first one on the next leaf (which did not + * change in the current transaction), then we don't need to log + * a range, last_old_dentry_offset is == to last_offset. */ - ret = insert_dir_log_key(trans, log, path, ino, first_offset, - last_offset); - if (ret) - err = ret; + ASSERT(last_old_dentry_offset <= last_offset); + if (last_old_dentry_offset < last_offset) { + ret = insert_dir_log_key(trans, log, path, ino, + last_old_dentry_offset + 1, + last_offset); + if (ret) + err = ret; + } } return err; } /* + * If the inode was logged before and it was evicted, then its + * last_dir_index_offset is (u64)-1, so we don't the value of the last index + * key offset. If that's the case, search for it and update the inode. This + * is to avoid lookups in the log tree every time we try to insert a dir index + * key from a leaf changed in the current transaction, and to allow us to always + * do batch insertions of dir index keys. + */ +static int update_last_dir_index_offset(struct btrfs_inode *inode, + struct btrfs_path *path, + const struct btrfs_log_ctx *ctx) +{ + const u64 ino = btrfs_ino(inode); + struct btrfs_key key; + int ret; + + lockdep_assert_held(&inode->log_mutex); + + if (inode->last_dir_index_offset != (u64)-1) + return 0; + + if (!ctx->logged_before) { + inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1; + return 0; + } + + key.objectid = ino; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0); + /* + * An error happened or we actually have an index key with an offset + * value of (u64)-1. Bail out, we're done. + */ + if (ret <= 0) + goto out; + + ret = 0; + inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1; + + /* + * No dir index items, bail out and leave last_dir_index_offset with + * the value right before the first valid index value. + */ + if (path->slots[0] == 0) + goto out; + + /* + * btrfs_search_slot() left us at one slot beyond the slot with the last + * index key, or beyond the last key of the directory that is not an + * index key. If we have an index key before, set last_dir_index_offset + * to its offset value, otherwise leave it with a value right before the + * first valid index value, as it means we have an empty directory. + */ + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); + if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY) + inode->last_dir_index_offset = key.offset; + +out: + btrfs_release_path(path); + + return ret; +} + +/* * logging directories is very similar to logging inodes, We find all the items * from the current transaction and write them to the log. * @@ -3997,22 +4056,11 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, u64 max_key; int ret; - /* - * If this is the first time we are being logged in the current - * transaction, or we were logged before but the inode was evicted and - * reloaded later, in which case its logged_trans is 0, reset the value - * of the last logged key offset. Note that we don't use the helper - * function inode_logged() here - that is because the function returns - * true after an inode eviction, assuming the worst case as it can not - * know for sure if the inode was logged before. So we can not skip key - * searches in the case the inode was evicted, because it may not have - * been logged in this transaction and may have been logged in a past - * transaction, so we need to reset the last dir index offset to (u64)-1. - */ - if (inode->logged_trans != trans->transid) - inode->last_dir_index_offset = (u64)-1; + ret = update_last_dir_index_offset(inode, path, ctx); + if (ret) + return ret; - min_key = 0; + min_key = BTRFS_DIR_START_INDEX; max_key = 0; ctx->last_dir_item_offset = inode->last_dir_index_offset; @@ -4048,9 +4096,6 @@ static int drop_inode_items(struct btrfs_trans_handle *trans, struct btrfs_key found_key; int start_slot; - if (!inode_logged(trans, inode)) - return 0; - key.objectid = btrfs_ino(inode); key.type = max_key_type; key.offset = (u64)-1; @@ -4240,8 +4285,8 @@ static int log_csums(struct btrfs_trans_handle *trans, * file which happens to refer to the same extent as well. Such races * can leave checksum items in the log with overlapping ranges. */ - ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr, - lock_end, &cached_state); + ret = lock_extent(&log_root->log_csum_range, sums->bytenr, lock_end, + &cached_state); if (ret) return ret; /* @@ -4257,8 +4302,8 @@ static int log_csums(struct btrfs_trans_handle *trans, if (!ret) ret = btrfs_csum_file_blocks(trans, log_root, sums); - unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end, - &cached_state); + unlock_extent(&log_root->log_csum_range, sums->bytenr, lock_end, + &cached_state); return ret; } @@ -4270,23 +4315,55 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, int start_slot, int nr, int inode_only, u64 logged_isize) { - struct btrfs_fs_info *fs_info = trans->fs_info; - unsigned long src_offset; - unsigned long dst_offset; struct btrfs_root *log = inode->root->log_root; struct btrfs_file_extent_item *extent; - struct btrfs_inode_item *inode_item; - struct extent_buffer *src = src_path->nodes[0]; - int ret; + struct extent_buffer *src; + int ret = 0; struct btrfs_key *ins_keys; u32 *ins_sizes; struct btrfs_item_batch batch; char *ins_data; int i; - struct list_head ordered_sums; - int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; + int dst_index; + const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); + const u64 i_size = i_size_read(&inode->vfs_inode); - INIT_LIST_HEAD(&ordered_sums); + /* + * To keep lockdep happy and avoid deadlocks, clone the source leaf and + * use the clone. This is because otherwise we would be changing the log + * tree, to insert items from the subvolume tree or insert csum items, + * while holding a read lock on a leaf from the subvolume tree, which + * creates a nasty lock dependency when COWing log tree nodes/leaves: + * + * 1) Modifying the log tree triggers an extent buffer allocation while + * holding a write lock on a parent extent buffer from the log tree. + * Allocating the pages for an extent buffer, or the extent buffer + * struct, can trigger inode eviction and finally the inode eviction + * will trigger a release/remove of a delayed node, which requires + * taking the delayed node's mutex; + * + * 2) Allocating a metadata extent for a log tree can trigger the async + * reclaim thread and make us wait for it to release enough space and + * unblock our reservation ticket. The reclaim thread can start + * flushing delayed items, and that in turn results in the need to + * lock delayed node mutexes and in the need to write lock extent + * buffers of a subvolume tree - all this while holding a write lock + * on the parent extent buffer in the log tree. + * + * So one task in scenario 1) running in parallel with another task in + * scenario 2) could lead to a deadlock, one wanting to lock a delayed + * node mutex while having a read lock on a leaf from the subvolume, + * while the other is holding the delayed node's mutex and wants to + * write lock the same subvolume leaf for flushing delayed items. + */ + src = btrfs_clone_extent_buffer(src_path->nodes[0]); + if (!src) + return -ENOMEM; + + i = src_path->slots[0]; + btrfs_release_path(src_path); + src_path->nodes[0] = src; + src_path->slots[0] = i; ins_data = kmalloc(nr * sizeof(struct btrfs_key) + nr * sizeof(u32), GFP_NOFS); @@ -4298,28 +4375,152 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, batch.keys = ins_keys; batch.data_sizes = ins_sizes; batch.total_data_size = 0; - batch.nr = nr; + batch.nr = 0; + dst_index = 0; for (i = 0; i < nr; i++) { - ins_sizes[i] = btrfs_item_size(src, i + start_slot); - batch.total_data_size += ins_sizes[i]; - btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); + const int src_slot = start_slot + i; + struct btrfs_root *csum_root; + struct btrfs_ordered_sum *sums; + struct btrfs_ordered_sum *sums_next; + LIST_HEAD(ordered_sums); + u64 disk_bytenr; + u64 disk_num_bytes; + u64 extent_offset; + u64 extent_num_bytes; + bool is_old_extent; + + btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot); + + if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY) + goto add_to_batch; + + extent = btrfs_item_ptr(src, src_slot, + struct btrfs_file_extent_item); + + is_old_extent = (btrfs_file_extent_generation(src, extent) < + trans->transid); + + /* + * Don't copy extents from past generations. That would make us + * log a lot more metadata for common cases like doing only a + * few random writes into a file and then fsync it for the first + * time or after the full sync flag is set on the inode. We can + * get leaves full of extent items, most of which are from past + * generations, so we can skip them - as long as the inode has + * not been the target of a reflink operation in this transaction, + * as in that case it might have had file extent items with old + * generations copied into it. We also must always log prealloc + * extents that start at or beyond eof, otherwise we would lose + * them on log replay. + */ + if (is_old_extent && + ins_keys[dst_index].offset < i_size && + inode->last_reflink_trans < trans->transid) + continue; + + if (skip_csum) + goto add_to_batch; + + /* Only regular extents have checksums. */ + if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG) + goto add_to_batch; + + /* + * If it's an extent created in a past transaction, then its + * checksums are already accessible from the committed csum tree, + * no need to log them. + */ + if (is_old_extent) + goto add_to_batch; + + disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent); + /* If it's an explicit hole, there are no checksums. */ + if (disk_bytenr == 0) + goto add_to_batch; + + disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent); + + if (btrfs_file_extent_compression(src, extent)) { + extent_offset = 0; + extent_num_bytes = disk_num_bytes; + } else { + extent_offset = btrfs_file_extent_offset(src, extent); + extent_num_bytes = btrfs_file_extent_num_bytes(src, extent); + } + + csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr); + disk_bytenr += extent_offset; + ret = btrfs_lookup_csums_range(csum_root, disk_bytenr, + disk_bytenr + extent_num_bytes - 1, + &ordered_sums, 0, false); + if (ret) + goto out; + + list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) { + if (!ret) + ret = log_csums(trans, inode, log, sums); + list_del(&sums->list); + kfree(sums); + } + if (ret) + goto out; + +add_to_batch: + ins_sizes[dst_index] = btrfs_item_size(src, src_slot); + batch.total_data_size += ins_sizes[dst_index]; + batch.nr++; + dst_index++; } + + /* + * We have a leaf full of old extent items that don't need to be logged, + * so we don't need to do anything. + */ + if (batch.nr == 0) + goto out; + ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); - if (ret) { - kfree(ins_data); - return ret; - } + if (ret) + goto out; + + dst_index = 0; + for (i = 0; i < nr; i++) { + const int src_slot = start_slot + i; + const int dst_slot = dst_path->slots[0] + dst_index; + struct btrfs_key key; + unsigned long src_offset; + unsigned long dst_offset; - for (i = 0; i < nr; i++, dst_path->slots[0]++) { - dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], - dst_path->slots[0]); + /* + * We're done, all the remaining items in the source leaf + * correspond to old file extent items. + */ + if (dst_index >= batch.nr) + break; + + btrfs_item_key_to_cpu(src, &key, src_slot); + + if (key.type != BTRFS_EXTENT_DATA_KEY) + goto copy_item; - src_offset = btrfs_item_ptr_offset(src, start_slot + i); + extent = btrfs_item_ptr(src, src_slot, + struct btrfs_file_extent_item); + + /* See the comment in the previous loop, same logic. */ + if (btrfs_file_extent_generation(src, extent) < trans->transid && + key.offset < i_size && + inode->last_reflink_trans < trans->transid) + continue; + +copy_item: + dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot); + src_offset = btrfs_item_ptr_offset(src, src_slot); + + if (key.type == BTRFS_INODE_ITEM_KEY) { + struct btrfs_inode_item *inode_item; - if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { - inode_item = btrfs_item_ptr(dst_path->nodes[0], - dst_path->slots[0], + inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot, struct btrfs_inode_item); fill_inode_item(trans, dst_path->nodes[0], inode_item, &inode->vfs_inode, @@ -4327,71 +4528,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, logged_isize); } else { copy_extent_buffer(dst_path->nodes[0], src, dst_offset, - src_offset, ins_sizes[i]); + src_offset, ins_sizes[dst_index]); } - /* take a reference on file data extents so that truncates - * or deletes of this inode don't have to relog the inode - * again - */ - if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && - !skip_csum) { - int found_type; - extent = btrfs_item_ptr(src, start_slot + i, - struct btrfs_file_extent_item); - - if (btrfs_file_extent_generation(src, extent) < trans->transid) - continue; - - found_type = btrfs_file_extent_type(src, extent); - if (found_type == BTRFS_FILE_EXTENT_REG) { - struct btrfs_root *csum_root; - u64 ds, dl, cs, cl; - ds = btrfs_file_extent_disk_bytenr(src, - extent); - /* ds == 0 is a hole */ - if (ds == 0) - continue; - - dl = btrfs_file_extent_disk_num_bytes(src, - extent); - cs = btrfs_file_extent_offset(src, extent); - cl = btrfs_file_extent_num_bytes(src, - extent); - if (btrfs_file_extent_compression(src, - extent)) { - cs = 0; - cl = dl; - } - - csum_root = btrfs_csum_root(fs_info, ds); - ret = btrfs_lookup_csums_range(csum_root, - ds + cs, ds + cs + cl - 1, - &ordered_sums, 0); - if (ret) - break; - } - } + dst_index++; } btrfs_mark_buffer_dirty(dst_path->nodes[0]); btrfs_release_path(dst_path); +out: kfree(ins_data); - /* - * we have to do this after the loop above to avoid changing the - * log tree while trying to change the log tree. - */ - while (!list_empty(&ordered_sums)) { - struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, - struct btrfs_ordered_sum, - list); - if (!ret) - ret = log_csums(trans, inode, log, sums); - list_del(&sums->list); - kfree(sums); - } - return ret; } @@ -4502,7 +4649,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, ret = btrfs_lookup_csums_range(csum_root, em->block_start + csum_offset, em->block_start + csum_offset + - csum_len - 1, &ordered_sums, 0); + csum_len - 1, &ordered_sums, 0, false); if (ret) return ret; @@ -4527,14 +4674,34 @@ static int log_one_extent(struct btrfs_trans_handle *trans, { struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_root *log = inode->root->log_root; - struct btrfs_file_extent_item *fi; + struct btrfs_file_extent_item fi = { 0 }; struct extent_buffer *leaf; - struct btrfs_map_token token; struct btrfs_key key; u64 extent_offset = em->start - em->orig_start; u64 block_len; int ret; + btrfs_set_stack_file_extent_generation(&fi, trans->transid); + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC); + else + btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG); + + block_len = max(em->block_len, em->orig_block_len); + if (em->compress_type != BTRFS_COMPRESS_NONE) { + btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start); + btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); + } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { + btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start - + extent_offset); + btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); + } + + btrfs_set_stack_file_extent_offset(&fi, extent_offset); + btrfs_set_stack_file_extent_num_bytes(&fi, em->len); + btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes); + btrfs_set_stack_file_extent_compression(&fi, em->compress_type); + ret = log_extent_csums(trans, inode, log, em, ctx); if (ret) return ret; @@ -4548,12 +4715,12 @@ static int log_one_extent(struct btrfs_trans_handle *trans, * are small, with a root at level 2 or 3 at most, due to their short * life span. */ - if (inode_logged(trans, inode)) { + if (ctx->logged_before) { drop_args.path = path; drop_args.start = em->start; drop_args.end = em->start + em->len; drop_args.replace_extent = true; - drop_args.extent_item_size = sizeof(*fi); + drop_args.extent_item_size = sizeof(fi); ret = btrfs_drop_extents(trans, log, inode, &drop_args); if (ret) return ret; @@ -4565,44 +4732,14 @@ static int log_one_extent(struct btrfs_trans_handle *trans, key.offset = em->start; ret = btrfs_insert_empty_item(trans, log, path, &key, - sizeof(*fi)); + sizeof(fi)); if (ret) return ret; } leaf = path->nodes[0]; - btrfs_init_map_token(&token, leaf); - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - btrfs_set_token_file_extent_generation(&token, fi, trans->transid); - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - btrfs_set_token_file_extent_type(&token, fi, - BTRFS_FILE_EXTENT_PREALLOC); - else - btrfs_set_token_file_extent_type(&token, fi, - BTRFS_FILE_EXTENT_REG); - - block_len = max(em->block_len, em->orig_block_len); - if (em->compress_type != BTRFS_COMPRESS_NONE) { - btrfs_set_token_file_extent_disk_bytenr(&token, fi, - em->block_start); - btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); - } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { - btrfs_set_token_file_extent_disk_bytenr(&token, fi, - em->block_start - - extent_offset); - btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); - } else { - btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0); - btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0); - } - - btrfs_set_token_file_extent_offset(&token, fi, extent_offset); - btrfs_set_token_file_extent_num_bytes(&token, fi, em->len); - btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes); - btrfs_set_token_file_extent_compression(&token, fi, em->compress_type); - btrfs_set_token_file_extent_encryption(&token, fi, 0); - btrfs_set_token_file_extent_other_encoding(&token, fi, 0); + write_extent_buffer(leaf, &fi, + btrfs_item_ptr_offset(leaf, path->slots[0]), + sizeof(fi)); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); @@ -4612,7 +4749,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, /* * Log all prealloc extents beyond the inode's i_size to make sure we do not - * lose them after doing a fast fsync and replaying the log. We scan the + * lose them after doing a full/fast fsync and replaying the log. We scan the * subvolume's root instead of iterating the inode's extent map tree because * otherwise we can log incorrect extent items based on extent map conversion. * That can happen due to the fact that extent maps are merged when they @@ -4816,7 +4953,6 @@ process: WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); - btrfs_release_path(path); if (!ret) ret = btrfs_log_prealloc_extents(trans, inode, path); if (ret) @@ -5025,10 +5161,9 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans, * leafs from the log root. */ btrfs_release_path(path); - ret = btrfs_insert_file_extent(trans, root->log_root, - ino, prev_extent_end, 0, - 0, hole_len, 0, hole_len, - 0, 0, 0); + ret = btrfs_insert_hole_extent(trans, root->log_root, + ino, prev_extent_end, + hole_len); if (ret < 0) return ret; @@ -5057,10 +5192,8 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans, btrfs_release_path(path); hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize); - ret = btrfs_insert_file_extent(trans, root->log_root, - ino, prev_extent_end, 0, 0, - hole_len, 0, hole_len, - 0, 0, 0); + ret = btrfs_insert_hole_extent(trans, root->log_root, ino, + prev_extent_end, hole_len); if (ret < 0) return ret; } @@ -5203,111 +5336,461 @@ out: return ret; } +/* + * Check if we need to log an inode. This is used in contexts where while + * logging an inode we need to log another inode (either that it exists or in + * full mode). This is used instead of btrfs_inode_in_log() because the later + * requires the inode to be in the log and have the log transaction committed, + * while here we do not care if the log transaction was already committed - our + * caller will commit the log later - and we want to avoid logging an inode + * multiple times when multiple tasks have joined the same log transaction. + */ +static bool need_log_inode(const struct btrfs_trans_handle *trans, + const struct btrfs_inode *inode) +{ + /* + * If a directory was not modified, no dentries added or removed, we can + * and should avoid logging it. + */ + if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid) + return false; + + /* + * If this inode does not have new/updated/deleted xattrs since the last + * time it was logged and is flagged as logged in the current transaction, + * we can skip logging it. As for new/deleted names, those are updated in + * the log by link/unlink/rename operations. + * In case the inode was logged and then evicted and reloaded, its + * logged_trans will be 0, in which case we have to fully log it since + * logged_trans is a transient field, not persisted. + */ + if (inode->logged_trans == trans->transid && + !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) + return false; + + return true; +} + +struct btrfs_dir_list { + u64 ino; + struct list_head list; +}; + +/* + * Log the inodes of the new dentries of a directory. + * See process_dir_items_leaf() for details about why it is needed. + * This is a recursive operation - if an existing dentry corresponds to a + * directory, that directory's new entries are logged too (same behaviour as + * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes + * the dentries point to we do not acquire their VFS lock, otherwise lockdep + * complains about the following circular lock dependency / possible deadlock: + * + * CPU0 CPU1 + * ---- ---- + * lock(&type->i_mutex_dir_key#3/2); + * lock(sb_internal#2); + * lock(&type->i_mutex_dir_key#3/2); + * lock(&sb->s_type->i_mutex_key#14); + * + * Where sb_internal is the lock (a counter that works as a lock) acquired by + * sb_start_intwrite() in btrfs_start_transaction(). + * Not acquiring the VFS lock of the inodes is still safe because: + * + * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible + * that while logging the inode new references (names) are added or removed + * from the inode, leaving the logged inode item with a link count that does + * not match the number of logged inode reference items. This is fine because + * at log replay time we compute the real number of links and correct the + * link count in the inode item (see replay_one_buffer() and + * link_to_fixup_dir()); + * + * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that + * while logging the inode's items new index items (key type + * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item + * has a size that doesn't match the sum of the lengths of all the logged + * names - this is ok, not a problem, because at log replay time we set the + * directory's i_size to the correct value (see replay_one_name() and + * do_overwrite_item()). + */ +static int log_new_dir_dentries(struct btrfs_trans_handle *trans, + struct btrfs_inode *start_inode, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *root = start_inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_path *path; + LIST_HEAD(dir_list); + struct btrfs_dir_list *dir_elem; + u64 ino = btrfs_ino(start_inode); + int ret = 0; + + /* + * If we are logging a new name, as part of a link or rename operation, + * don't bother logging new dentries, as we just want to log the names + * of an inode and that any new parents exist. + */ + if (ctx->logging_new_name) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (true) { + struct extent_buffer *leaf; + struct btrfs_key min_key; + bool continue_curr_inode = true; + int nritems; + int i; + + min_key.objectid = ino; + min_key.type = BTRFS_DIR_INDEX_KEY; + min_key.offset = 0; +again: + btrfs_release_path(path); + ret = btrfs_search_forward(root, &min_key, path, trans->transid); + if (ret < 0) { + break; + } else if (ret > 0) { + ret = 0; + goto next; + } + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_dir_item *di; + struct btrfs_key di_key; + struct inode *di_inode; + int log_mode = LOG_INODE_EXISTS; + int type; + + btrfs_item_key_to_cpu(leaf, &min_key, i); + if (min_key.objectid != ino || + min_key.type != BTRFS_DIR_INDEX_KEY) { + continue_curr_inode = false; + break; + } + + di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); + type = btrfs_dir_type(leaf, di); + if (btrfs_dir_transid(leaf, di) < trans->transid) + continue; + btrfs_dir_item_key_to_cpu(leaf, di, &di_key); + if (di_key.type == BTRFS_ROOT_ITEM_KEY) + continue; + + btrfs_release_path(path); + di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); + if (IS_ERR(di_inode)) { + ret = PTR_ERR(di_inode); + goto out; + } + + if (!need_log_inode(trans, BTRFS_I(di_inode))) { + btrfs_add_delayed_iput(di_inode); + break; + } + + ctx->log_new_dentries = false; + if (type == BTRFS_FT_DIR) + log_mode = LOG_INODE_ALL; + ret = btrfs_log_inode(trans, BTRFS_I(di_inode), + log_mode, ctx); + btrfs_add_delayed_iput(di_inode); + if (ret) + goto out; + if (ctx->log_new_dentries) { + dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); + if (!dir_elem) { + ret = -ENOMEM; + goto out; + } + dir_elem->ino = di_key.objectid; + list_add_tail(&dir_elem->list, &dir_list); + } + break; + } + + if (continue_curr_inode && min_key.offset < (u64)-1) { + min_key.offset++; + goto again; + } + +next: + if (list_empty(&dir_list)) + break; + + dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list); + ino = dir_elem->ino; + list_del(&dir_elem->list); + kfree(dir_elem); + } +out: + btrfs_free_path(path); + if (ret) { + struct btrfs_dir_list *next; + + list_for_each_entry_safe(dir_elem, next, &dir_list, list) + kfree(dir_elem); + } + + return ret; +} + struct btrfs_ino_list { u64 ino; u64 parent; struct list_head list; }; -static int log_conflicting_inodes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_log_ctx *ctx, - u64 ino, u64 parent) +static void free_conflicting_inodes(struct btrfs_log_ctx *ctx) +{ + struct btrfs_ino_list *curr; + struct btrfs_ino_list *next; + + list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) { + list_del(&curr->list); + kfree(curr); + } +} + +static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino, + struct btrfs_path *path) +{ + struct btrfs_key key; + int ret; + + key.objectid = ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + path->search_commit_root = 1; + path->skip_locking = 1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (WARN_ON_ONCE(ret > 0)) { + /* + * We have previously found the inode through the commit root + * so this should not happen. If it does, just error out and + * fallback to a transaction commit. + */ + ret = -ENOENT; + } else if (ret == 0) { + struct btrfs_inode_item *item; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item))) + ret = 1; + } + + btrfs_release_path(path); + path->search_commit_root = 0; + path->skip_locking = 0; + + return ret; +} + +static int add_conflicting_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 ino, u64 parent, + struct btrfs_log_ctx *ctx) { struct btrfs_ino_list *ino_elem; - LIST_HEAD(inode_list); - int ret = 0; + struct inode *inode; + + /* + * It's rare to have a lot of conflicting inodes, in practice it is not + * common to have more than 1 or 2. We don't want to collect too many, + * as we could end up logging too many inodes (even if only in + * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction + * commits. + */ + if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) + return BTRFS_LOG_FORCE_COMMIT; + + inode = btrfs_iget(root->fs_info->sb, ino, root); + /* + * If the other inode that had a conflicting dir entry was deleted in + * the current transaction then we either: + * + * 1) Log the parent directory (later after adding it to the list) if + * the inode is a directory. This is because it may be a deleted + * subvolume/snapshot or it may be a regular directory that had + * deleted subvolumes/snapshots (or subdirectories that had them), + * and at the moment we can't deal with dropping subvolumes/snapshots + * during log replay. So we just log the parent, which will result in + * a fallback to a transaction commit if we are dealing with those + * cases (last_unlink_trans will match the current transaction); + * + * 2) Do nothing if it's not a directory. During log replay we simply + * unlink the conflicting dentry from the parent directory and then + * add the dentry for our inode. Like this we can avoid logging the + * parent directory (and maybe fallback to a transaction commit in + * case it has a last_unlink_trans == trans->transid, due to moving + * some inode from it to some other directory). + */ + if (IS_ERR(inode)) { + int ret = PTR_ERR(inode); + + if (ret != -ENOENT) + return ret; + + ret = conflicting_inode_is_dir(root, ino, path); + /* Not a directory or we got an error. */ + if (ret <= 0) + return ret; + + /* Conflicting inode is a directory, so we'll log its parent. */ + ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); + if (!ino_elem) + return -ENOMEM; + ino_elem->ino = ino; + ino_elem->parent = parent; + list_add_tail(&ino_elem->list, &ctx->conflict_inodes); + ctx->num_conflict_inodes++; + + return 0; + } + + /* + * If the inode was already logged skip it - otherwise we can hit an + * infinite loop. Example: + * + * From the commit root (previous transaction) we have the following + * inodes: + * + * inode 257 a directory + * inode 258 with references "zz" and "zz_link" on inode 257 + * inode 259 with reference "a" on inode 257 + * + * And in the current (uncommitted) transaction we have: + * + * inode 257 a directory, unchanged + * inode 258 with references "a" and "a2" on inode 257 + * inode 259 with reference "zz_link" on inode 257 + * inode 261 with reference "zz" on inode 257 + * + * When logging inode 261 the following infinite loop could + * happen if we don't skip already logged inodes: + * + * - we detect inode 258 as a conflicting inode, with inode 261 + * on reference "zz", and log it; + * + * - we detect inode 259 as a conflicting inode, with inode 258 + * on reference "a", and log it; + * + * - we detect inode 258 as a conflicting inode, with inode 259 + * on reference "zz_link", and log it - again! After this we + * repeat the above steps forever. + * + * Here we can use need_log_inode() because we only need to log the + * inode in LOG_INODE_EXISTS mode and rename operations update the log, + * so that the log ends up with the new name and without the old name. + */ + if (!need_log_inode(trans, BTRFS_I(inode))) { + btrfs_add_delayed_iput(inode); + return 0; + } + + btrfs_add_delayed_iput(inode); ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); if (!ino_elem) return -ENOMEM; ino_elem->ino = ino; ino_elem->parent = parent; - list_add_tail(&ino_elem->list, &inode_list); + list_add_tail(&ino_elem->list, &ctx->conflict_inodes); + ctx->num_conflict_inodes++; - while (!list_empty(&inode_list)) { - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_key key; - struct inode *inode; + return 0; +} - ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list, - list); - ino = ino_elem->ino; - parent = ino_elem->parent; - list_del(&ino_elem->list); - kfree(ino_elem); - if (ret) - continue; +static int log_conflicting_inodes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + int ret = 0; - btrfs_release_path(path); + /* + * Conflicting inodes are logged by the first call to btrfs_log_inode(), + * otherwise we could have unbounded recursion of btrfs_log_inode() + * calls. This check guarantees we can have only 1 level of recursion. + */ + if (ctx->logging_conflict_inodes) + return 0; + + ctx->logging_conflict_inodes = true; + + /* + * New conflicting inodes may be found and added to the list while we + * are logging a conflicting inode, so keep iterating while the list is + * not empty. + */ + while (!list_empty(&ctx->conflict_inodes)) { + struct btrfs_ino_list *curr; + struct inode *inode; + u64 ino; + u64 parent; + + curr = list_first_entry(&ctx->conflict_inodes, + struct btrfs_ino_list, list); + ino = curr->ino; + parent = curr->parent; + list_del(&curr->list); + kfree(curr); inode = btrfs_iget(fs_info->sb, ino, root); /* * If the other inode that had a conflicting dir entry was * deleted in the current transaction, we need to log its parent - * directory. + * directory. See the comment at add_conflicting_inode(). */ if (IS_ERR(inode)) { ret = PTR_ERR(inode); - if (ret == -ENOENT) { - inode = btrfs_iget(fs_info->sb, parent, root); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - } else { - ret = btrfs_log_inode(trans, - BTRFS_I(inode), - LOG_OTHER_INODE_ALL, - ctx); - btrfs_add_delayed_iput(inode); - } + if (ret != -ENOENT) + break; + + inode = btrfs_iget(fs_info->sb, parent, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + break; } + + /* + * Always log the directory, we cannot make this + * conditional on need_log_inode() because the directory + * might have been logged in LOG_INODE_EXISTS mode or + * the dir index of the conflicting inode is not in a + * dir index key range logged for the directory. So we + * must make sure the deletion is recorded. + */ + ret = btrfs_log_inode(trans, BTRFS_I(inode), + LOG_INODE_ALL, ctx); + btrfs_add_delayed_iput(inode); + if (ret) + break; continue; } + /* - * If the inode was already logged skip it - otherwise we can - * hit an infinite loop. Example: - * - * From the commit root (previous transaction) we have the - * following inodes: - * - * inode 257 a directory - * inode 258 with references "zz" and "zz_link" on inode 257 - * inode 259 with reference "a" on inode 257 - * - * And in the current (uncommitted) transaction we have: - * - * inode 257 a directory, unchanged - * inode 258 with references "a" and "a2" on inode 257 - * inode 259 with reference "zz_link" on inode 257 - * inode 261 with reference "zz" on inode 257 - * - * When logging inode 261 the following infinite loop could - * happen if we don't skip already logged inodes: + * Here we can use need_log_inode() because we only need to log + * the inode in LOG_INODE_EXISTS mode and rename operations + * update the log, so that the log ends up with the new name and + * without the old name. * - * - we detect inode 258 as a conflicting inode, with inode 261 - * on reference "zz", and log it; - * - * - we detect inode 259 as a conflicting inode, with inode 258 - * on reference "a", and log it; - * - * - we detect inode 258 as a conflicting inode, with inode 259 - * on reference "zz_link", and log it - again! After this we - * repeat the above steps forever. + * We did this check at add_conflicting_inode(), but here we do + * it again because if some other task logged the inode after + * that, we can avoid doing it again. */ - spin_lock(&BTRFS_I(inode)->lock); - /* - * Check the inode's logged_trans only instead of - * btrfs_inode_in_log(). This is because the last_log_commit of - * the inode is not updated when we only log that it exists (see - * btrfs_log_inode()). - */ - if (BTRFS_I(inode)->logged_trans == trans->transid) { - spin_unlock(&BTRFS_I(inode)->lock); + if (!need_log_inode(trans, BTRFS_I(inode))) { btrfs_add_delayed_iput(inode); continue; } - spin_unlock(&BTRFS_I(inode)->lock); + /* * We are safe logging the other inode without acquiring its * lock as long as we log with the LOG_INODE_EXISTS mode. We @@ -5315,67 +5798,16 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * well because during a rename we pin the log and update the * log with the new name before we unpin it. */ - ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE, ctx); - if (ret) { - btrfs_add_delayed_iput(inode); - continue; - } - - key.objectid = ino; - key.type = BTRFS_INODE_REF_KEY; - key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { - btrfs_add_delayed_iput(inode); - continue; - } - - while (true) { - struct extent_buffer *leaf = path->nodes[0]; - int slot = path->slots[0]; - u64 other_ino = 0; - u64 other_parent = 0; - - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) { - break; - } else if (ret > 0) { - ret = 0; - break; - } - continue; - } - - btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid != ino || - (key.type != BTRFS_INODE_REF_KEY && - key.type != BTRFS_INODE_EXTREF_KEY)) { - ret = 0; - break; - } - - ret = btrfs_check_ref_name_override(leaf, slot, &key, - BTRFS_I(inode), &other_ino, - &other_parent); - if (ret < 0) - break; - if (ret > 0) { - ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); - if (!ino_elem) { - ret = -ENOMEM; - break; - } - ino_elem->ino = other_ino; - ino_elem->parent = other_parent; - list_add_tail(&ino_elem->list, &inode_list); - ret = 0; - } - path->slots[0]++; - } + ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); btrfs_add_delayed_iput(inode); + if (ret) + break; } + ctx->logging_conflict_inodes = false; + if (ret) + free_conflicting_inodes(ctx); + return ret; } @@ -5386,11 +5818,11 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_path *dst_path, const u64 logged_isize, - const bool recursive_logging, const int inode_only, struct btrfs_log_ctx *ctx, bool *need_log_inode_item) { + const u64 i_size = i_size_read(&inode->vfs_inode); struct btrfs_root *root = inode->root; int ins_start_slot = 0; int ins_nr = 0; @@ -5411,13 +5843,21 @@ again: if (min_key->type > max_key->type) break; - if (min_key->type == BTRFS_INODE_ITEM_KEY) + if (min_key->type == BTRFS_INODE_ITEM_KEY) { *need_log_inode_item = false; - - if ((min_key->type == BTRFS_INODE_REF_KEY || - min_key->type == BTRFS_INODE_EXTREF_KEY) && - inode->generation == trans->transid && - !recursive_logging) { + } else if (min_key->type == BTRFS_EXTENT_DATA_KEY && + min_key->offset >= i_size) { + /* + * Extents at and beyond eof are logged with + * btrfs_log_prealloc_extents(). + * Only regular files have BTRFS_EXTENT_DATA_KEY keys, + * and no keys greater than that, so bail out. + */ + break; + } else if ((min_key->type == BTRFS_INODE_REF_KEY || + min_key->type == BTRFS_INODE_EXTREF_KEY) && + (inode->generation == trans->transid || + ctx->logging_conflict_inodes)) { u64 other_ino = 0; u64 other_parent = 0; @@ -5441,17 +5881,16 @@ again: return ret; ins_nr = 0; - ret = log_conflicting_inodes(trans, root, path, - ctx, other_ino, other_parent); + btrfs_release_path(path); + ret = add_conflicting_inode(trans, root, path, + other_ino, + other_parent, ctx); if (ret) return ret; - btrfs_release_path(path); goto next_key; } - } - - /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ - if (min_key->type == BTRFS_XATTR_ITEM_KEY) { + } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) { + /* Skip xattrs, logged later with btrfs_log_all_xattrs() */ if (ins_nr == 0) goto next_slot; ret = copy_items(trans, inode, dst_path, path, @@ -5503,10 +5942,394 @@ next_key: } else { break; } + + /* + * We may process many leaves full of items for our inode, so + * avoid monopolizing a cpu for too long by rescheduling while + * not holding locks on any tree. + */ + cond_resched(); } - if (ins_nr) + if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, ins_start_slot, ins_nr, inode_only, logged_isize); + if (ret) + return ret; + } + + if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) { + /* + * Release the path because otherwise we might attempt to double + * lock the same leaf with btrfs_log_prealloc_extents() below. + */ + btrfs_release_path(path); + ret = btrfs_log_prealloc_extents(trans, inode, dst_path); + } + + return ret; +} + +static int insert_delayed_items_batch(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + const struct btrfs_item_batch *batch, + const struct btrfs_delayed_item *first_item) +{ + const struct btrfs_delayed_item *curr = first_item; + int ret; + + ret = btrfs_insert_empty_items(trans, log, path, batch); + if (ret) + return ret; + + for (int i = 0; i < batch->nr; i++) { + char *data_ptr; + + data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char); + write_extent_buffer(path->nodes[0], &curr->data, + (unsigned long)data_ptr, curr->data_len); + curr = list_next_entry(curr, log_list); + path->slots[0]++; + } + + btrfs_release_path(path); + + return 0; +} + +static int log_delayed_insertion_items(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + const struct list_head *delayed_ins_list, + struct btrfs_log_ctx *ctx) +{ + /* 195 (4095 bytes of keys and sizes) fits in a single 4K page. */ + const int max_batch_size = 195; + const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info); + const u64 ino = btrfs_ino(inode); + struct btrfs_root *log = inode->root->log_root; + struct btrfs_item_batch batch = { + .nr = 0, + .total_data_size = 0, + }; + const struct btrfs_delayed_item *first = NULL; + const struct btrfs_delayed_item *curr; + char *ins_data; + struct btrfs_key *ins_keys; + u32 *ins_sizes; + u64 curr_batch_size = 0; + int batch_idx = 0; + int ret; + + /* We are adding dir index items to the log tree. */ + lockdep_assert_held(&inode->log_mutex); + + /* + * We collect delayed items before copying index keys from the subvolume + * to the log tree. However just after we collected them, they may have + * been flushed (all of them or just some of them), and therefore we + * could have copied them from the subvolume tree to the log tree. + * So find the first delayed item that was not yet logged (they are + * sorted by index number). + */ + list_for_each_entry(curr, delayed_ins_list, log_list) { + if (curr->index > inode->last_dir_index_offset) { + first = curr; + break; + } + } + + /* Empty list or all delayed items were already logged. */ + if (!first) + return 0; + + ins_data = kmalloc(max_batch_size * sizeof(u32) + + max_batch_size * sizeof(struct btrfs_key), GFP_NOFS); + if (!ins_data) + return -ENOMEM; + ins_sizes = (u32 *)ins_data; + batch.data_sizes = ins_sizes; + ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32)); + batch.keys = ins_keys; + + curr = first; + while (!list_entry_is_head(curr, delayed_ins_list, log_list)) { + const u32 curr_size = curr->data_len + sizeof(struct btrfs_item); + + if (curr_batch_size + curr_size > leaf_data_size || + batch.nr == max_batch_size) { + ret = insert_delayed_items_batch(trans, log, path, + &batch, first); + if (ret) + goto out; + batch_idx = 0; + batch.nr = 0; + batch.total_data_size = 0; + curr_batch_size = 0; + first = curr; + } + + ins_sizes[batch_idx] = curr->data_len; + ins_keys[batch_idx].objectid = ino; + ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY; + ins_keys[batch_idx].offset = curr->index; + curr_batch_size += curr_size; + batch.total_data_size += curr->data_len; + batch.nr++; + batch_idx++; + curr = list_next_entry(curr, log_list); + } + + ASSERT(batch.nr >= 1); + ret = insert_delayed_items_batch(trans, log, path, &batch, first); + + curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item, + log_list); + inode->last_dir_index_offset = curr->index; +out: + kfree(ins_data); + + return ret; +} + +static int log_delayed_deletions_full(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + const struct list_head *delayed_del_list, + struct btrfs_log_ctx *ctx) +{ + const u64 ino = btrfs_ino(inode); + const struct btrfs_delayed_item *curr; + + curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item, + log_list); + + while (!list_entry_is_head(curr, delayed_del_list, log_list)) { + u64 first_dir_index = curr->index; + u64 last_dir_index; + const struct btrfs_delayed_item *next; + int ret; + + /* + * Find a range of consecutive dir index items to delete. Like + * this we log a single dir range item spanning several contiguous + * dir items instead of logging one range item per dir index item. + */ + next = list_next_entry(curr, log_list); + while (!list_entry_is_head(next, delayed_del_list, log_list)) { + if (next->index != curr->index + 1) + break; + curr = next; + next = list_next_entry(next, log_list); + } + + last_dir_index = curr->index; + ASSERT(last_dir_index >= first_dir_index); + + ret = insert_dir_log_key(trans, inode->root->log_root, path, + ino, first_dir_index, last_dir_index); + if (ret) + return ret; + curr = list_next_entry(curr, log_list); + } + + return 0; +} + +static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + struct btrfs_log_ctx *ctx, + const struct list_head *delayed_del_list, + const struct btrfs_delayed_item *first, + const struct btrfs_delayed_item **last_ret) +{ + const struct btrfs_delayed_item *next; + struct extent_buffer *leaf = path->nodes[0]; + const int last_slot = btrfs_header_nritems(leaf) - 1; + int slot = path->slots[0] + 1; + const u64 ino = btrfs_ino(inode); + + next = list_next_entry(first, log_list); + + while (slot < last_slot && + !list_entry_is_head(next, delayed_del_list, log_list)) { + struct btrfs_key key; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != ino || + key.type != BTRFS_DIR_INDEX_KEY || + key.offset != next->index) + break; + + slot++; + *last_ret = next; + next = list_next_entry(next, log_list); + } + + return btrfs_del_items(trans, inode->root->log_root, path, + path->slots[0], slot - path->slots[0]); +} + +static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + const struct list_head *delayed_del_list, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *log = inode->root->log_root; + const struct btrfs_delayed_item *curr; + u64 last_range_start; + u64 last_range_end = 0; + struct btrfs_key key; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_DIR_INDEX_KEY; + curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item, + log_list); + + while (!list_entry_is_head(curr, delayed_del_list, log_list)) { + const struct btrfs_delayed_item *last = curr; + u64 first_dir_index = curr->index; + u64 last_dir_index; + bool deleted_items = false; + int ret; + + key.offset = curr->index; + ret = btrfs_search_slot(trans, log, &key, path, -1, 1); + if (ret < 0) { + return ret; + } else if (ret == 0) { + ret = batch_delete_dir_index_items(trans, inode, path, ctx, + delayed_del_list, curr, + &last); + if (ret) + return ret; + deleted_items = true; + } + + btrfs_release_path(path); + + /* + * If we deleted items from the leaf, it means we have a range + * item logging their range, so no need to add one or update an + * existing one. Otherwise we have to log a dir range item. + */ + if (deleted_items) + goto next_batch; + + last_dir_index = last->index; + ASSERT(last_dir_index >= first_dir_index); + /* + * If this range starts right after where the previous one ends, + * then we want to reuse the previous range item and change its + * end offset to the end of this range. This is just to minimize + * leaf space usage, by avoiding adding a new range item. + */ + if (last_range_end != 0 && first_dir_index == last_range_end + 1) + first_dir_index = last_range_start; + + ret = insert_dir_log_key(trans, log, path, key.objectid, + first_dir_index, last_dir_index); + if (ret) + return ret; + + last_range_start = first_dir_index; + last_range_end = last_dir_index; +next_batch: + curr = list_next_entry(last, log_list); + } + + return 0; +} + +static int log_delayed_deletion_items(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + const struct list_head *delayed_del_list, + struct btrfs_log_ctx *ctx) +{ + /* + * We are deleting dir index items from the log tree or adding range + * items to it. + */ + lockdep_assert_held(&inode->log_mutex); + + if (list_empty(delayed_del_list)) + return 0; + + if (ctx->logged_before) + return log_delayed_deletions_incremental(trans, inode, path, + delayed_del_list, ctx); + + return log_delayed_deletions_full(trans, inode, path, delayed_del_list, + ctx); +} + +/* + * Similar logic as for log_new_dir_dentries(), but it iterates over the delayed + * items instead of the subvolume tree. + */ +static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + const struct list_head *delayed_ins_list, + struct btrfs_log_ctx *ctx) +{ + const bool orig_log_new_dentries = ctx->log_new_dentries; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_item *item; + int ret = 0; + + /* + * No need for the log mutex, plus to avoid potential deadlocks or + * lockdep annotations due to nesting of delayed inode mutexes and log + * mutexes. + */ + lockdep_assert_not_held(&inode->log_mutex); + + ASSERT(!ctx->logging_new_delayed_dentries); + ctx->logging_new_delayed_dentries = true; + + list_for_each_entry(item, delayed_ins_list, log_list) { + struct btrfs_dir_item *dir_item; + struct inode *di_inode; + struct btrfs_key key; + int log_mode = LOG_INODE_EXISTS; + + dir_item = (struct btrfs_dir_item *)item->data; + btrfs_disk_key_to_cpu(&key, &dir_item->location); + + if (key.type == BTRFS_ROOT_ITEM_KEY) + continue; + + di_inode = btrfs_iget(fs_info->sb, key.objectid, inode->root); + if (IS_ERR(di_inode)) { + ret = PTR_ERR(di_inode); + break; + } + + if (!need_log_inode(trans, BTRFS_I(di_inode))) { + btrfs_add_delayed_iput(di_inode); + continue; + } + + if (btrfs_stack_dir_type(dir_item) == BTRFS_FT_DIR) + log_mode = LOG_INODE_ALL; + + ctx->log_new_dentries = false; + ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); + + if (!ret && ctx->log_new_dentries) + ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx); + + btrfs_add_delayed_iput(di_inode); + + if (ret) + break; + } + + ctx->log_new_dentries = orig_log_new_dentries; + ctx->logging_new_delayed_dentries = false; return ret; } @@ -5535,16 +6358,17 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key min_key; struct btrfs_key max_key; struct btrfs_root *log = inode->root->log_root; - int err = 0; - int ret = 0; + int ret; bool fast_search = false; u64 ino = btrfs_ino(inode); struct extent_map_tree *em_tree = &inode->extent_tree; u64 logged_isize = 0; bool need_log_inode_item = true; bool xattrs_logged = false; - bool recursive_logging = false; bool inode_item_dropped = true; + bool full_dir_logging = false; + LIST_HEAD(delayed_ins_list); + LIST_HEAD(delayed_del_list); path = btrfs_alloc_path(); if (!path) @@ -5572,27 +6396,69 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, max_key.type = (u8)-1; max_key.offset = (u64)-1; + if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL) + full_dir_logging = true; + /* - * Only run delayed items if we are a directory. We want to make sure - * all directory indexes hit the fs/subvolume tree so we can find them - * and figure out which index ranges have to be logged. + * If we are logging a directory while we are logging dentries of the + * delayed items of some other inode, then we need to flush the delayed + * items of this directory and not log the delayed items directly. This + * is to prevent more than one level of recursion into btrfs_log_inode() + * by having something like this: + * + * $ mkdir -p a/b/c/d/e/f/g/h/... + * $ xfs_io -c "fsync" a + * + * Where all directories in the path did not exist before and are + * created in the current transaction. + * So in such a case we directly log the delayed items of the main + * directory ("a") without flushing them first, while for each of its + * subdirectories we flush their delayed items before logging them. + * This prevents a potential unbounded recursion like this: + * + * btrfs_log_inode() + * log_new_delayed_dentries() + * btrfs_log_inode() + * log_new_delayed_dentries() + * btrfs_log_inode() + * log_new_delayed_dentries() + * (...) + * + * We have thresholds for the maximum number of delayed items to have in + * memory, and once they are hit, the items are flushed asynchronously. + * However the limit is quite high, so lets prevent deep levels of + * recursion to happen by limiting the maximum depth to be 1. */ - if (S_ISDIR(inode->vfs_inode.i_mode)) { - err = btrfs_commit_inode_delayed_items(trans, inode); - if (err) + if (full_dir_logging && ctx->logging_new_delayed_dentries) { + ret = btrfs_commit_inode_delayed_items(trans, inode); + if (ret) goto out; } - if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) { - recursive_logging = true; - if (inode_only == LOG_OTHER_INODE) - inode_only = LOG_INODE_EXISTS; - else - inode_only = LOG_INODE_ALL; - mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); - } else { - mutex_lock(&inode->log_mutex); - } + mutex_lock(&inode->log_mutex); + + /* + * For symlinks, we must always log their content, which is stored in an + * inline extent, otherwise we could end up with an empty symlink after + * log replay, which is invalid on linux (symlink(2) returns -ENOENT if + * one attempts to create an empty symlink). + * We don't need to worry about flushing delalloc, because when we create + * the inline extent when the symlink is created (we never have delalloc + * for symlinks). + */ + if (S_ISLNK(inode->vfs_inode.i_mode)) + inode_only = LOG_INODE_ALL; + + /* + * Before logging the inode item, cache the value returned by + * inode_logged(), because after that we have the need to figure out if + * the inode was previously logged in this transaction. + */ + ret = inode_logged(trans, inode, path); + if (ret < 0) + goto out_unlock; + ctx->logged_before = (ret == 1); + ret = 0; /* * This is for cases where logging a directory could result in losing a @@ -5601,11 +6467,9 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * to known the file was moved from A to B, so logging just A would * result in losing the file after a log replay. */ - if (S_ISDIR(inode->vfs_inode.i_mode) && - inode_only == LOG_INODE_ALL && - inode->last_unlink_trans >= trans->transid) { + if (full_dir_logging && inode->last_unlink_trans >= trans->transid) { btrfs_set_log_full_commit(trans); - err = 1; + ret = BTRFS_LOG_FORCE_COMMIT; goto out_unlock; } @@ -5614,14 +6478,12 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * copies of everything. */ if (S_ISDIR(inode->vfs_inode.i_mode)) { - int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; - clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); - if (inode_only == LOG_INODE_EXISTS) - max_key_type = BTRFS_XATTR_ITEM_KEY; - ret = drop_inode_items(trans, log, path, inode, max_key_type); + if (ctx->logged_before) + ret = drop_inode_items(trans, log, path, inode, + BTRFS_XATTR_ITEM_KEY); } else { - if (inode_only == LOG_INODE_EXISTS && inode_logged(trans, inode)) { + if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) { /* * Make sure the new inode item we write to the log has * the same isize as the current one (if it exists). @@ -5635,22 +6497,23 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * (zeroes), as if an expanding truncate happened, * instead of getting a file of 4Kb only. */ - err = logged_inode_size(log, inode, path, &logged_isize); - if (err) + ret = logged_inode_size(log, inode, path, &logged_isize); + if (ret) goto out_unlock; } if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) { if (inode_only == LOG_INODE_EXISTS) { max_key.type = BTRFS_XATTR_ITEM_KEY; - ret = drop_inode_items(trans, log, path, inode, - max_key.type); + if (ctx->logged_before) + ret = drop_inode_items(trans, log, path, + inode, max_key.type); } else { clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); - if (inode_logged(trans, inode)) + if (ctx->logged_before) ret = truncate_inode_items(trans, log, inode, 0, 0); } @@ -5660,8 +6523,9 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (inode_only == LOG_INODE_ALL) fast_search = true; max_key.type = BTRFS_XATTR_ITEM_KEY; - ret = drop_inode_items(trans, log, path, inode, - max_key.type); + if (ctx->logged_before) + ret = drop_inode_items(trans, log, path, inode, + max_key.type); } else { if (inode_only == LOG_INODE_ALL) fast_search = true; @@ -5670,37 +6534,45 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } } - if (ret) { - err = ret; + if (ret) goto out_unlock; - } - err = copy_inode_items_to_log(trans, inode, &min_key, &max_key, + /* + * If we are logging a directory in full mode, collect the delayed items + * before iterating the subvolume tree, so that we don't miss any new + * dir index items in case they get flushed while or right after we are + * iterating the subvolume tree. + */ + if (full_dir_logging && !ctx->logging_new_delayed_dentries) + btrfs_log_get_delayed_items(inode, &delayed_ins_list, + &delayed_del_list); + + ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key, path, dst_path, logged_isize, - recursive_logging, inode_only, ctx, + inode_only, ctx, &need_log_inode_item); - if (err) + if (ret) goto out_unlock; btrfs_release_path(path); btrfs_release_path(dst_path); - err = btrfs_log_all_xattrs(trans, inode, path, dst_path); - if (err) + ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); + if (ret) goto out_unlock; xattrs_logged = true; if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { btrfs_release_path(path); btrfs_release_path(dst_path); - err = btrfs_log_holes(trans, inode, path); - if (err) + ret = btrfs_log_holes(trans, inode, path); + if (ret) goto out_unlock; } log_extents: btrfs_release_path(path); btrfs_release_path(dst_path); if (need_log_inode_item) { - err = log_inode_item(trans, log, dst_path, inode, inode_item_dropped); - if (err) + ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped); + if (ret) goto out_unlock; /* * If we are doing a fast fsync and the inode was logged before @@ -5711,18 +6583,16 @@ log_extents: * BTRFS_INODE_COPY_EVERYTHING set. */ if (!xattrs_logged && inode->logged_trans < trans->transid) { - err = btrfs_log_all_xattrs(trans, inode, path, dst_path); - if (err) + ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); + if (ret) goto out_unlock; btrfs_release_path(path); } } if (fast_search) { ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx); - if (ret) { - err = ret; + if (ret) goto out_unlock; - } } else if (inode_only == LOG_INODE_ALL) { struct extent_map *em, *n; @@ -5732,12 +6602,18 @@ log_extents: write_unlock(&em_tree->lock); } - if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { + if (full_dir_logging) { ret = log_directory_changes(trans, inode, path, dst_path, ctx); - if (ret) { - err = ret; + if (ret) + goto out_unlock; + ret = log_delayed_insertion_items(trans, inode, path, + &delayed_ins_list, ctx); + if (ret) + goto out_unlock; + ret = log_delayed_deletion_items(trans, inode, path, + &delayed_del_list, ctx); + if (ret) goto out_unlock; - } } spin_lock(&inode->lock); @@ -5776,223 +6652,34 @@ log_extents: if (inode_only != LOG_INODE_EXISTS) inode->last_log_commit = inode->last_sub_trans; spin_unlock(&inode->lock); + + /* + * Reset the last_reflink_trans so that the next fsync does not need to + * go through the slower path when logging extents and their checksums. + */ + if (inode_only == LOG_INODE_ALL) + inode->last_reflink_trans = 0; + out_unlock: mutex_unlock(&inode->log_mutex); out: btrfs_free_path(path); btrfs_free_path(dst_path); - return err; -} - -/* - * Check if we need to log an inode. This is used in contexts where while - * logging an inode we need to log another inode (either that it exists or in - * full mode). This is used instead of btrfs_inode_in_log() because the later - * requires the inode to be in the log and have the log transaction committed, - * while here we do not care if the log transaction was already committed - our - * caller will commit the log later - and we want to avoid logging an inode - * multiple times when multiple tasks have joined the same log transaction. - */ -static bool need_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode) -{ - /* - * If a directory was not modified, no dentries added or removed, we can - * and should avoid logging it. - */ - if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid) - return false; - - /* - * If this inode does not have new/updated/deleted xattrs since the last - * time it was logged and is flagged as logged in the current transaction, - * we can skip logging it. As for new/deleted names, those are updated in - * the log by link/unlink/rename operations. - * In case the inode was logged and then evicted and reloaded, its - * logged_trans will be 0, in which case we have to fully log it since - * logged_trans is a transient field, not persisted. - */ - if (inode->logged_trans == trans->transid && - !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) - return false; - - return true; -} - -struct btrfs_dir_list { - u64 ino; - struct list_head list; -}; -/* - * Log the inodes of the new dentries of a directory. See log_dir_items() for - * details about the why it is needed. - * This is a recursive operation - if an existing dentry corresponds to a - * directory, that directory's new entries are logged too (same behaviour as - * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes - * the dentries point to we do not lock their i_mutex, otherwise lockdep - * complains about the following circular lock dependency / possible deadlock: - * - * CPU0 CPU1 - * ---- ---- - * lock(&type->i_mutex_dir_key#3/2); - * lock(sb_internal#2); - * lock(&type->i_mutex_dir_key#3/2); - * lock(&sb->s_type->i_mutex_key#14); - * - * Where sb_internal is the lock (a counter that works as a lock) acquired by - * sb_start_intwrite() in btrfs_start_transaction(). - * Not locking i_mutex of the inodes is still safe because: - * - * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible - * that while logging the inode new references (names) are added or removed - * from the inode, leaving the logged inode item with a link count that does - * not match the number of logged inode reference items. This is fine because - * at log replay time we compute the real number of links and correct the - * link count in the inode item (see replay_one_buffer() and - * link_to_fixup_dir()); - * - * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that - * while logging the inode's items new index items (key type - * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item - * has a size that doesn't match the sum of the lengths of all the logged - * names - this is ok, not a problem, because at log replay time we set the - * directory's i_size to the correct value (see replay_one_name() and - * do_overwrite_item()). - */ -static int log_new_dir_dentries(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_inode *start_inode, - struct btrfs_log_ctx *ctx) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_root *log = root->log_root; - struct btrfs_path *path; - LIST_HEAD(dir_list); - struct btrfs_dir_list *dir_elem; - int ret = 0; - - /* - * If we are logging a new name, as part of a link or rename operation, - * don't bother logging new dentries, as we just want to log the names - * of an inode and that any new parents exist. - */ - if (ctx->logging_new_name) - return 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); - if (!dir_elem) { - btrfs_free_path(path); - return -ENOMEM; - } - dir_elem->ino = btrfs_ino(start_inode); - list_add_tail(&dir_elem->list, &dir_list); - - while (!list_empty(&dir_list)) { - struct extent_buffer *leaf; - struct btrfs_key min_key; - int nritems; - int i; - - dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, - list); - if (ret) - goto next_dir_inode; - - min_key.objectid = dir_elem->ino; - min_key.type = BTRFS_DIR_INDEX_KEY; - min_key.offset = 0; -again: - btrfs_release_path(path); - ret = btrfs_search_forward(log, &min_key, path, trans->transid); - if (ret < 0) { - goto next_dir_inode; - } else if (ret > 0) { - ret = 0; - goto next_dir_inode; - } - -process_leaf: - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - for (i = path->slots[0]; i < nritems; i++) { - struct btrfs_dir_item *di; - struct btrfs_key di_key; - struct inode *di_inode; - struct btrfs_dir_list *new_dir_elem; - int log_mode = LOG_INODE_EXISTS; - int type; - - btrfs_item_key_to_cpu(leaf, &min_key, i); - if (min_key.objectid != dir_elem->ino || - min_key.type != BTRFS_DIR_INDEX_KEY) - goto next_dir_inode; - - di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); - type = btrfs_dir_type(leaf, di); - if (btrfs_dir_transid(leaf, di) < trans->transid && - type != BTRFS_FT_DIR) - continue; - btrfs_dir_item_key_to_cpu(leaf, di, &di_key); - if (di_key.type == BTRFS_ROOT_ITEM_KEY) - continue; - - btrfs_release_path(path); - di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); - if (IS_ERR(di_inode)) { - ret = PTR_ERR(di_inode); - goto next_dir_inode; - } + if (ret) + free_conflicting_inodes(ctx); + else + ret = log_conflicting_inodes(trans, inode->root, ctx); - if (!need_log_inode(trans, BTRFS_I(di_inode))) { - btrfs_add_delayed_iput(di_inode); - break; - } + if (full_dir_logging && !ctx->logging_new_delayed_dentries) { + if (!ret) + ret = log_new_delayed_dentries(trans, inode, + &delayed_ins_list, ctx); - ctx->log_new_dentries = false; - if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) - log_mode = LOG_INODE_ALL; - ret = btrfs_log_inode(trans, BTRFS_I(di_inode), - log_mode, ctx); - btrfs_add_delayed_iput(di_inode); - if (ret) - goto next_dir_inode; - if (ctx->log_new_dentries) { - new_dir_elem = kmalloc(sizeof(*new_dir_elem), - GFP_NOFS); - if (!new_dir_elem) { - ret = -ENOMEM; - goto next_dir_inode; - } - new_dir_elem->ino = di_key.objectid; - list_add_tail(&new_dir_elem->list, &dir_list); - } - break; - } - if (i == nritems) { - ret = btrfs_next_leaf(log, path); - if (ret < 0) { - goto next_dir_inode; - } else if (ret > 0) { - ret = 0; - goto next_dir_inode; - } - goto process_leaf; - } - if (min_key.offset < (u64)-1) { - min_key.offset++; - goto again; - } -next_dir_inode: - list_del(&dir_elem->list); - kfree(dir_elem); + btrfs_log_put_delayed_items(inode, &delayed_ins_list, + &delayed_del_list); } - btrfs_free_path(path); return ret; } @@ -6104,7 +6791,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, ret = btrfs_log_inode(trans, BTRFS_I(dir_inode), LOG_INODE_ALL, ctx); if (!ret && ctx->log_new_dentries) - ret = log_new_dir_dentries(trans, root, + ret = log_new_dir_dentries(trans, BTRFS_I(dir_inode), ctx); btrfs_add_delayed_iput(dir_inode); if (ret) @@ -6322,12 +7009,12 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, bool log_dentries = false; if (btrfs_test_opt(fs_info, NOTREELOG)) { - ret = 1; + ret = BTRFS_LOG_FORCE_COMMIT; goto end_no_trans; } if (btrfs_root_refs(&root->root_item) == 0) { - ret = 1; + ret = BTRFS_LOG_FORCE_COMMIT; goto end_no_trans; } @@ -6419,13 +7106,13 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; if (log_dentries) - ret = log_new_dir_dentries(trans, root, inode, ctx); + ret = log_new_dir_dentries(trans, inode, ctx); else ret = 0; end_trans: if (ret < 0) { btrfs_set_log_full_commit(trans); - ret = 1; + ret = BTRFS_LOG_FORCE_COMMIT; } if (ret) @@ -6713,15 +7400,32 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, mutex_unlock(&dir->log_mutex); } -/* - * Call this after adding a new name for a file and it will properly - * update the log to reflect the new name. +/** + * Update the log after adding a new name for an inode. + * + * @trans: Transaction handle. + * @old_dentry: The dentry associated with the old name and the old + * parent directory. + * @old_dir: The inode of the previous parent directory for the case + * of a rename. For a link operation, it must be NULL. + * @old_dir_index: The index number associated with the old name, meaningful + * only for rename operations (when @old_dir is not NULL). + * Ignored for link operations. + * @parent: The dentry associated with the directory under which the + * new name is located. + * + * Call this after adding a new name for an inode, as a result of a link or + * rename operation, and it will properly update the log to reflect the new name. */ void btrfs_log_new_name(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, struct btrfs_inode *old_dir, - struct dentry *parent) + struct dentry *old_dentry, struct btrfs_inode *old_dir, + u64 old_dir_index, struct dentry *parent) { + struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry)); + struct btrfs_root *root = inode->root; struct btrfs_log_ctx ctx; + bool log_pinned = false; + int ret; /* * this will force the logging code to walk the dentry chain @@ -6734,26 +7438,90 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * if this inode hasn't been logged and directory we're renaming it * from hasn't been logged, we don't need to log it */ - if (!inode_logged(trans, inode) && - (!old_dir || !inode_logged(trans, old_dir))) - return; + ret = inode_logged(trans, inode, NULL); + if (ret < 0) { + goto out; + } else if (ret == 0) { + if (!old_dir) + return; + /* + * If the inode was not logged and we are doing a rename (old_dir is not + * NULL), check if old_dir was logged - if it was not we can return and + * do nothing. + */ + ret = inode_logged(trans, old_dir, NULL); + if (ret < 0) + goto out; + else if (ret == 0) + return; + } + ret = 0; /* * If we are doing a rename (old_dir is not NULL) from a directory that - * was previously logged, make sure the next log attempt on the directory - * is not skipped and logs the inode again. This is because the log may - * not currently be authoritative for a range including the old - * BTRFS_DIR_INDEX_KEY key, so we want to make sure after a log replay we - * do not end up with both the new and old dentries around (in case the - * inode is a directory we would have a directory with two hard links and - * 2 inode references for different parents). The next log attempt of - * old_dir will happen at btrfs_log_all_parents(), called through - * btrfs_log_inode_parent() below, because we have previously set - * inode->last_unlink_trans to the current transaction ID, either here or - * at btrfs_record_unlink_dir() in case the inode is a directory. + * was previously logged, make sure that on log replay we get the old + * dir entry deleted. This is needed because we will also log the new + * name of the renamed inode, so we need to make sure that after log + * replay we don't end up with both the new and old dir entries existing. */ - if (old_dir) - old_dir->logged_trans = 0; + if (old_dir && old_dir->logged_trans == trans->transid) { + struct btrfs_root *log = old_dir->root->log_root; + struct btrfs_path *path; + + ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX); + + /* + * We have two inodes to update in the log, the old directory and + * the inode that got renamed, so we must pin the log to prevent + * anyone from syncing the log until we have updated both inodes + * in the log. + */ + ret = join_running_log_trans(root); + /* + * At least one of the inodes was logged before, so this should + * not fail, but if it does, it's not serious, just bail out and + * mark the log for a full commit. + */ + if (WARN_ON_ONCE(ret < 0)) + goto out; + log_pinned = true; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + /* + * Other concurrent task might be logging the old directory, + * as it can be triggered when logging other inode that had or + * still has a dentry in the old directory. We lock the old + * directory's log_mutex to ensure the deletion of the old + * name is persisted, because during directory logging we + * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of + * the old name's dir index item is in the delayed items, so + * it could be missed by an in progress directory logging. + */ + mutex_lock(&old_dir->log_mutex); + ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), + old_dentry->d_name.name, + old_dentry->d_name.len, old_dir_index); + if (ret > 0) { + /* + * The dentry does not exist in the log, so record its + * deletion. + */ + btrfs_release_path(path); + ret = insert_dir_log_key(trans, log, path, + btrfs_ino(old_dir), + old_dir_index, old_dir_index); + } + mutex_unlock(&old_dir->log_mutex); + + btrfs_free_path(path); + if (ret < 0) + goto out; + } btrfs_init_log_ctx(&ctx, &inode->vfs_inode); ctx.logging_new_name = true; @@ -6765,5 +7533,17 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * inconsistent state after a rename operation. */ btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); + ASSERT(list_empty(&ctx.conflict_inodes)); +out: + /* + * If an error happened mark the log for a full commit because it's not + * consistent and up to date or we couldn't find out if one of the + * inodes was logged before in this transaction. Do it before unpinning + * the log, to avoid any races with someone else trying to commit it. + */ + if (ret < 0) + btrfs_set_log_full_commit(trans); + if (log_pinned) + btrfs_end_log_trans(root); } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index f6811c3df38a..aed1e05e9879 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -12,17 +12,26 @@ /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 +/* We can't use the tree log for whatever reason, force a transaction commit */ +#define BTRFS_LOG_FORCE_COMMIT (1) + struct btrfs_log_ctx { int log_ret; int log_transid; bool log_new_dentries; bool logging_new_name; + bool logging_new_delayed_dentries; + /* Indicate if the inode being logged was logged before. */ + bool logged_before; /* Tracks the last logged dir item/index key offset. */ u64 last_dir_item_offset; struct inode *inode; struct list_head list; /* Only used for fast fsyncs. */ struct list_head ordered_extents; + struct list_head conflict_inodes; + int num_conflict_inodes; + bool logging_conflict_inodes; }; static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, @@ -32,9 +41,14 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, ctx->log_transid = 0; ctx->log_new_dentries = false; ctx->logging_new_name = false; + ctx->logging_new_delayed_dentries = false; + ctx->logged_before = false; ctx->inode = inode; INIT_LIST_HEAD(&ctx->list); INIT_LIST_HEAD(&ctx->ordered_extents); + INIT_LIST_HEAD(&ctx->conflict_inodes); + ctx->num_conflict_inodes = 0; + ctx->logging_conflict_inodes = false; } static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) @@ -86,7 +100,7 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, struct btrfs_inode *dir); void btrfs_log_new_name(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, struct btrfs_inode *old_dir, - struct dentry *parent); + struct dentry *old_dentry, struct btrfs_inode *old_dir, + u64 old_dir_index, struct dentry *parent); #endif diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 90eb5c2830a9..ee00e33c309e 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -659,8 +659,7 @@ rollback: * * Returns the size on success or a negative error code on failure. */ -static int btrfs_get_verity_descriptor(struct inode *inode, void *buf, - size_t buf_size) +int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size) { u64 true_size; int ret = 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b07d382d53a8..635f45f1a2ef 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -34,6 +34,8 @@ #include "discard.h" #include "zoned.h" +static struct bio_set btrfs_bioset; + #define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID10 | \ BTRFS_BLOCK_GROUP_RAID56_MASK) @@ -164,24 +166,12 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { */ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags) { - if (flags & BTRFS_BLOCK_GROUP_RAID10) - return BTRFS_RAID_RAID10; - else if (flags & BTRFS_BLOCK_GROUP_RAID1) - return BTRFS_RAID_RAID1; - else if (flags & BTRFS_BLOCK_GROUP_RAID1C3) - return BTRFS_RAID_RAID1C3; - else if (flags & BTRFS_BLOCK_GROUP_RAID1C4) - return BTRFS_RAID_RAID1C4; - else if (flags & BTRFS_BLOCK_GROUP_DUP) - return BTRFS_RAID_DUP; - else if (flags & BTRFS_BLOCK_GROUP_RAID0) - return BTRFS_RAID_RAID0; - else if (flags & BTRFS_BLOCK_GROUP_RAID5) - return BTRFS_RAID_RAID5; - else if (flags & BTRFS_BLOCK_GROUP_RAID6) - return BTRFS_RAID_RAID6; - - return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ + const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK); + + if (!profile) + return BTRFS_RAID_SINGLE; + + return BTRFS_BG_FLAG_TO_INDEX(profile); } const char *btrfs_bg_type_to_raid_name(u64 flags) @@ -194,6 +184,13 @@ const char *btrfs_bg_type_to_raid_name(u64 flags) return btrfs_raid_array[index].raid_name; } +int btrfs_nr_parity_stripes(u64 type) +{ + enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type); + + return btrfs_raid_array[index].nparity; +} + /* * Fill @buf with textual description of @bg_flags, no more than @size_buf * bytes including terminating null byte. @@ -250,13 +247,12 @@ out_overflow:; static int init_first_rw_device(struct btrfs_trans_handle *trans); static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); -static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); static int __btrfs_map_block(struct btrfs_fs_info *fs_info, - enum btrfs_map_op op, - u64 logical, u64 *length, + enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret, - int mirror_num, int need_raid_map); + struct btrfs_io_stripe *smap, + int *mirror_num_ret, int need_raid_map); /* * Device locking @@ -405,7 +401,6 @@ void btrfs_free_device(struct btrfs_device *device) WARN_ON(!list_empty(&device->post_commit_list)); rcu_string_free(device->name); extent_io_tree_release(&device->alloc_state); - bio_put(device->flush_bio); btrfs_destroy_dev_zone_info(device); kfree(device); } @@ -534,30 +529,20 @@ error: return ret; } -static bool device_path_matched(const char *path, struct btrfs_device *device) -{ - int found; - - rcu_read_lock(); - found = strcmp(rcu_str_deref(device->name), path); - rcu_read_unlock(); - - return found == 0; -} - -/* - * Search and remove all stale (devices which are not mounted) devices. +/** + * Search and remove all stale devices (which are not mounted). * When both inputs are NULL, it will search and release all stale devices. - * path: Optional. When provided will it release all unmounted devices - * matching this path only. - * skip_dev: Optional. Will skip this device when searching for the stale + * + * @devt: Optional. When provided will it release all unmounted devices + * matching this devt only. + * @skip_device: Optional. Will skip this device when searching for the stale * devices. - * Return: 0 for success or if @path is NULL. - * -EBUSY if @path is a mounted device. - * -ENOENT if @path does not match any device in the list. + * + * Return: 0 for success or if @devt is 0. + * -EBUSY if @devt is a mounted device. + * -ENOENT if @devt does not match any device in the list. */ -static int btrfs_free_stale_devices(const char *path, - struct btrfs_device *skip_device) +static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device) { struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; struct btrfs_device *device, *tmp_device; @@ -565,7 +550,7 @@ static int btrfs_free_stale_devices(const char *path, lockdep_assert_held(&uuid_mutex); - if (path) + if (devt) ret = -ENOENT; list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { @@ -575,13 +560,11 @@ static int btrfs_free_stale_devices(const char *path, &fs_devices->devices, dev_list) { if (skip_device && skip_device == device) continue; - if (path && !device->name) - continue; - if (path && !device_path_matched(path, device)) + if (devt && devt != device->devt) continue; if (fs_devices->opened) { /* for an already deleted device return 0 */ - if (path && ret != 0) + if (devt && ret != 0) ret = -EBUSY; break; } @@ -614,7 +597,6 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device, fmode_t flags, void *holder) { - struct request_queue *q; struct block_device *bdev; struct btrfs_super_block *disk_super; u64 devid; @@ -656,8 +638,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); } - q = bdev_get_queue(bdev); - if (!blk_queue_nonrot(q)) + if (!bdev_nonrot(bdev)) fs_devices->rotating = true; device->bdev = bdev; @@ -781,11 +762,17 @@ static noinline struct btrfs_device *device_list_add(const char *path, struct rcu_string *name; u64 found_transid = btrfs_super_generation(disk_super); u64 devid = btrfs_stack_device_id(&disk_super->dev_item); + dev_t path_devt; + int error; bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_METADATA_UUID); bool fsid_change_in_progress = (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_CHANGING_FSID_V2); + error = lookup_bdev(path, &path_devt); + if (error) + return ERR_PTR(error); + if (fsid_change_in_progress) { if (!has_metadata_uuid) fs_devices = find_fsid_inprogress(disk_super); @@ -868,6 +855,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, return ERR_PTR(-ENOMEM); } rcu_assign_pointer(device->name, name); + device->devt = path_devt; list_add_rcu(&device->dev_list, &fs_devices->devices); fs_devices->num_devices++; @@ -928,25 +916,15 @@ static noinline struct btrfs_device *device_list_add(const char *path, /* * We are going to replace the device path for a given devid, * make sure it's the same device if the device is mounted + * + * NOTE: the device->fs_info may not be reliable here so pass + * in a NULL to message helpers instead. This avoids a possible + * use-after-free when the fs_info and fs_info->sb are already + * torn down. */ if (device->bdev) { - int error; - dev_t path_dev; - - error = lookup_bdev(path, &path_dev); - if (error) { - mutex_unlock(&fs_devices->device_list_mutex); - return ERR_PTR(error); - } - - if (device->bdev->bd_dev != path_dev) { + if (device->devt != path_devt) { mutex_unlock(&fs_devices->device_list_mutex); - /* - * device->fs_info may not be reliable here, so - * pass in a NULL instead. This avoids a - * possible use-after-free when the fs_info and - * fs_info->sb are already torn down. - */ btrfs_warn_in_rcu(NULL, "duplicate device %s devid %llu generation %llu scanned by %s (%d)", path, devid, found_transid, @@ -954,7 +932,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, task_pid_nr(current)); return ERR_PTR(-EEXIST); } - btrfs_info_in_rcu(device->fs_info, + btrfs_info_in_rcu(NULL, "devid %llu device path %s changed to %s scanned by %s (%d)", devid, rcu_str_deref(device->name), path, current->comm, @@ -972,6 +950,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, fs_devices->missing_devices--; clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); } + device->devt = path_devt; } /* @@ -1032,6 +1011,18 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) rcu_assign_pointer(device->name, name); } + if (orig_dev->zone_info) { + struct btrfs_zoned_device_info *zone_info; + + zone_info = btrfs_clone_dev_zone_info(orig_dev); + if (!zone_info) { + btrfs_free_device(device); + ret = -ENOMEM; + goto error; + } + device->zone_info = zone_info; + } + list_add(&device->dev_list, &fs_devices->devices); device->fs_devices = fs_devices; fs_devices->num_devices++; @@ -1331,12 +1322,12 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev return disk_super; } -int btrfs_forget_devices(const char *path) +int btrfs_forget_devices(dev_t devt) { int ret; mutex_lock(&uuid_mutex); - ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL); + ret = btrfs_free_stale_devices(devt, NULL); mutex_unlock(&uuid_mutex); return ret; @@ -1385,10 +1376,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, } device = device_list_add(path, disk_super, &new_device_added); - if (!IS_ERR(device)) { - if (new_device_added) - btrfs_free_stale_devices(path, device); - } + if (!IS_ERR(device) && new_device_added) + btrfs_free_stale_devices(device->devt, device); btrfs_release_disk_super(disk_super); @@ -1427,12 +1416,7 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) { switch (device->fs_devices->chunk_alloc_policy) { case BTRFS_CHUNK_ALLOC_REGULAR: - /* - * We don't want to overwrite the superblock on the drive nor - * any area used by the boot loader (grub for example), so we - * make sure to start at an offset of at least 1MB. - */ - return max_t(u64, start, SZ_1M); + return max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED); case BTRFS_CHUNK_ALLOC_ZONED: /* * We don't care about the starting region like regular @@ -1914,23 +1898,18 @@ static void update_dev_time(const char *device_path) path_put(&path); } -static int btrfs_rm_dev_item(struct btrfs_device *device) +static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, + struct btrfs_device *device) { struct btrfs_root *root = device->fs_info->chunk_root; int ret; struct btrfs_path *path; struct btrfs_key key; - struct btrfs_trans_handle *trans; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - btrfs_free_path(path); - return PTR_ERR(trans); - } key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; @@ -1941,21 +1920,12 @@ static int btrfs_rm_dev_item(struct btrfs_device *device) if (ret) { if (ret > 0) ret = -ENOENT; - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); goto out; } ret = btrfs_del_item(trans, root, path); - if (ret) { - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - } - out: btrfs_free_path(path); - if (!ret) - ret = btrfs_commit_transaction(trans); return ret; } @@ -2061,7 +2031,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct page *page; int ret; - disk_super = btrfs_read_dev_one_super(bdev, copy_num); + disk_super = btrfs_read_dev_one_super(bdev, copy_num, false); if (IS_ERR(disk_super)) continue; @@ -2096,12 +2066,18 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, struct block_device **bdev, fmode_t *mode) { + struct btrfs_trans_handle *trans; struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; u64 num_devices; int ret = 0; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, "device remove not supported on extent tree v2 yet"); + return -EINVAL; + } + /* * The device list in fs_devices is accessed without locks (neither * uuid_mutex nor device_list_mutex) as it won't change on a mounted @@ -2111,7 +2087,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); if (ret) - goto out; + return ret; device = btrfs_find_device(fs_info->fs_devices, args); if (!device) { @@ -2119,27 +2095,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; else ret = -ENOENT; - goto out; + return ret; } if (btrfs_pinned_by_swapfile(fs_info, device)) { btrfs_warn_in_rcu(fs_info, "cannot remove device %s (devid %llu) due to active swapfile", rcu_str_deref(device->name), device->devid); - ret = -ETXTBSY; - goto out; + return -ETXTBSY; } - if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { - ret = BTRFS_ERROR_DEV_TGT_REPLACE; - goto out; - } + if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) + return BTRFS_ERROR_DEV_TGT_REPLACE; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && - fs_info->fs_devices->rw_devices == 1) { - ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; - goto out; - } + fs_info->fs_devices->rw_devices == 1) + return BTRFS_ERROR_DEV_ONLY_WRITABLE; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); @@ -2152,14 +2123,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, if (ret) goto error_undo; - /* - * TODO: the superblock still includes this device in its num_devices - * counter although write_all_supers() is not locked out. This - * could give a filesystem state which requires a degraded mount. - */ - ret = btrfs_rm_dev_item(device); - if (ret) + trans = btrfs_start_transaction(fs_info->chunk_root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); goto error_undo; + } + + ret = btrfs_rm_dev_item(trans, device); + if (ret) { + /* Any error in dev item removal is critical */ + btrfs_crit(fs_info, + "failed to remove device item for devid %llu: %d", + device->devid, ret); + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); btrfs_scrub_cancel_dev(device); @@ -2242,7 +2221,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, free_fs_devices(cur_devices); } -out: + ret = btrfs_commit_transaction(trans); + return ret; error_undo: @@ -2253,7 +2233,7 @@ error_undo: device->fs_devices->rw_devices++; mutex_unlock(&fs_info->chunk_mutex); } - goto out; + return ret; } void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) @@ -2379,8 +2359,11 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0, &bdev, &disk_super); - if (ret) + if (ret) { + btrfs_put_dev_args_from_path(args); return ret; + } + args->devid = btrfs_stack_device_id(&disk_super->dev_item); memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE); if (btrfs_fs_incompat(fs_info, METADATA_UUID)) @@ -2606,7 +2589,6 @@ error: int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) { struct btrfs_root *root = fs_info->dev_root; - struct request_queue *q; struct btrfs_trans_handle *trans; struct btrfs_device *device; struct block_device *bdev; @@ -2668,6 +2650,9 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path device->fs_info = fs_info; device->bdev = bdev; + ret = lookup_bdev(device_path, &device->devt); + if (ret) + goto error_free_device; ret = btrfs_get_dev_zone_info(device, false); if (ret) @@ -2679,7 +2664,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path goto error_free_zone; } - q = bdev_get_queue(bdev); set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); device->generation = trans->transid; device->io_width = fs_info->sectorsize; @@ -2727,7 +2711,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path atomic64_add(device->total_bytes, &fs_info->free_chunk_space); - if (!blk_queue_nonrot(q)) + if (!bdev_nonrot(bdev)) fs_devices->rotating = true; orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); @@ -2814,7 +2798,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path * We can ignore the return value as it typically returns -EINVAL and * only succeeds if the device was an alien. */ - btrfs_forget_devices(device_path); + btrfs_forget_devices(device->devt); /* Update ctime/mtime for blkid or udev */ update_dev_time(device_path); @@ -3251,6 +3235,12 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) u64 length; int ret; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + btrfs_err(fs_info, + "relocate: not supported on extent tree v2 yet"); + return -EINVAL; + } + /* * Prevent races with automatic removal of unused block groups. * After we relocate and before we remove the chunk with offset @@ -4078,13 +4068,6 @@ static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) return true; - if (fs_info->sectorsize < PAGE_SIZE && - bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) { - btrfs_err(fs_info, - "RAID56 is not yet supported for sectorsize %u with page size %lu", - fs_info->sectorsize, PAGE_SIZE); - return false; - } /* Profile is valid and does not have bits outside of the allowed set */ if (alloc_profile_is_valid(bargs->target, 1) && (bargs->target & ~allowed) == 0) @@ -4445,10 +4428,12 @@ static int balance_kthread(void *data) struct btrfs_fs_info *fs_info = data; int ret = 0; + sb_start_write(fs_info->sb); mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl) ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); mutex_unlock(&fs_info->balance_mutex); + sb_end_write(fs_info->sb); return ret; } @@ -5104,26 +5089,16 @@ static void init_alloc_chunk_ctl_policy_regular( struct btrfs_fs_devices *fs_devices, struct alloc_chunk_ctl *ctl) { - u64 type = ctl->type; + struct btrfs_space_info *space_info; - if (type & BTRFS_BLOCK_GROUP_DATA) { - ctl->max_stripe_size = SZ_1G; - ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; - } else if (type & BTRFS_BLOCK_GROUP_METADATA) { - /* For larger filesystems, use larger metadata chunks */ - if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) - ctl->max_stripe_size = SZ_1G; - else - ctl->max_stripe_size = SZ_256M; - ctl->max_chunk_size = ctl->max_stripe_size; - } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { - ctl->max_stripe_size = SZ_32M; - ctl->max_chunk_size = 2 * ctl->max_stripe_size; - ctl->devs_max = min_t(int, ctl->devs_max, - BTRFS_MAX_DEVS_SYS_CHUNK); - } else { - BUG(); - } + space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type); + ASSERT(space_info); + + ctl->max_chunk_size = READ_ONCE(space_info->chunk_size); + ctl->max_stripe_size = ctl->max_chunk_size; + + if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM) + ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); /* We don't want a chunk larger than 10% of writable space */ ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), @@ -5306,6 +5281,9 @@ static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, ctl->stripe_size); } + /* Stripe size should not go beyond 1G. */ + ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G); + /* Align to BTRFS_STRIPE_LEN */ ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN); ctl->chunk_size = ctl->stripe_size * data_stripes; @@ -5631,7 +5609,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, if (ret) goto out; - bg->chunk_item_inserted = 1; + set_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &bg->runtime_flags); if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); @@ -5753,7 +5731,8 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { struct extent_map *em; struct map_lookup *map; - int ret; + enum btrfs_raid_types index; + int ret = 1; em = btrfs_get_chunk_map(fs_info, logical, len); if (IS_ERR(em)) @@ -5766,10 +5745,11 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) return 1; map = em->map_lookup; - if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK)) - ret = map->num_stripes; - else if (map->type & BTRFS_BLOCK_GROUP_RAID10) - ret = map->sub_stripes; + index = btrfs_bg_flags_to_raid_index(map->type); + + /* Non-RAID56, use their ncopies from btrfs_raid_array. */ + if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)) + ret = btrfs_raid_array[index].ncopies; else if (map->type & BTRFS_BLOCK_GROUP_RAID5) ret = 2; else if (map->type & BTRFS_BLOCK_GROUP_RAID6) @@ -5781,8 +5761,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) * stripe under reconstruction. */ ret = map->num_stripes; - else - ret = 1; free_extent_map(em); down_read(&fs_info->dev_replace.rwsem); @@ -5801,6 +5779,9 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, struct map_lookup *map; unsigned long len = fs_info->sectorsize; + if (!btrfs_fs_incompat(fs_info, RAID56)) + return len; + em = btrfs_get_chunk_map(fs_info, logical, len); if (!WARN_ON(IS_ERR(em))) { @@ -5818,6 +5799,9 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) struct map_lookup *map; int ret = 0; + if (!btrfs_fs_incompat(fs_info, RAID56)) + return 0; + em = btrfs_get_chunk_map(fs_info, logical, len); if(!WARN_ON(IS_ERR(em))) { @@ -5926,7 +5910,6 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_ sizeof(u64) * (total_stripes), GFP_NOFS|__GFP_NOFAIL); - atomic_set(&bioc->error, 0); refcount_set(&bioc->refs, 1); bioc->fs_info = fs_info; @@ -5950,18 +5933,17 @@ void btrfs_put_bioc(struct btrfs_io_context *bioc) kfree(bioc); } -/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */ /* * Please note that, discard won't be sent to target device of device * replace. */ -static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, - u64 logical, u64 *length_ret, - struct btrfs_io_context **bioc_ret) +struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, + u64 logical, u64 *length_ret, + u32 *num_stripes) { struct extent_map *em; struct map_lookup *map; - struct btrfs_io_context *bioc; + struct btrfs_discard_stripe *stripes; u64 length = *length_ret; u64 offset; u64 stripe_nr; @@ -5970,29 +5952,26 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, u64 stripe_cnt; u64 stripe_len; u64 stripe_offset; - u64 num_stripes; u32 stripe_index; u32 factor = 0; u32 sub_stripes = 0; u64 stripes_per_dev = 0; u32 remaining_stripes = 0; u32 last_stripe = 0; - int ret = 0; + int ret; int i; - /* Discard always returns a bioc. */ - ASSERT(bioc_ret); - em = btrfs_get_chunk_map(fs_info, logical, length); if (IS_ERR(em)) - return PTR_ERR(em); + return ERR_CAST(em); map = em->map_lookup; + /* we don't discard raid56 yet */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = -EOPNOTSUPP; - goto out; - } + goto out_free_map; +} offset = logical - em->start; length = min_t(u64, em->start + em->len - logical, length); @@ -6018,7 +5997,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, * device we have to walk to find the data, and stripe_index is * the number of our device in the stripe array */ - num_stripes = 1; + *num_stripes = 1; stripe_index = 0; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { @@ -6028,7 +6007,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, sub_stripes = map->sub_stripes; factor = map->num_stripes / sub_stripes; - num_stripes = min_t(u64, map->num_stripes, + *num_stripes = min_t(u64, map->num_stripes, sub_stripes * stripe_cnt); stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); stripe_index *= sub_stripes; @@ -6038,31 +6017,30 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, last_stripe *= sub_stripes; } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_DUP)) { - num_stripes = map->num_stripes; + *num_stripes = map->num_stripes; } else { stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); } - bioc = alloc_btrfs_io_context(fs_info, num_stripes, 0); - if (!bioc) { + stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS); + if (!stripes) { ret = -ENOMEM; - goto out; + goto out_free_map; } - for (i = 0; i < num_stripes; i++) { - bioc->stripes[i].physical = + for (i = 0; i < *num_stripes; i++) { + stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; - bioc->stripes[i].dev = map->stripes[stripe_index].dev; + stripes[i].dev = map->stripes[stripe_index].dev; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { - bioc->stripes[i].length = stripes_per_dev * - map->stripe_len; + stripes[i].length = stripes_per_dev * map->stripe_len; if (i / sub_stripes < remaining_stripes) - bioc->stripes[i].length += map->stripe_len; + stripes[i].length += map->stripe_len; /* * Special for the first stripe and @@ -6073,17 +6051,17 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, * off end_off */ if (i < sub_stripes) - bioc->stripes[i].length -= stripe_offset; + stripes[i].length -= stripe_offset; if (stripe_index >= last_stripe && stripe_index <= (last_stripe + sub_stripes - 1)) - bioc->stripes[i].length -= stripe_end_offset; + stripes[i].length -= stripe_end_offset; if (i == sub_stripes - 1) stripe_offset = 0; } else { - bioc->stripes[i].length = length; + stripes[i].length = length; } stripe_index++; @@ -6093,12 +6071,11 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, } } - *bioc_ret = bioc; - bioc->map_type = map->type; - bioc->num_stripes = num_stripes; -out: free_extent_map(em); - return ret; + return stripes; +out_free_map: + free_extent_map(em); + return ERR_PTR(ret); } /* @@ -6128,7 +6105,7 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, int ret = 0; ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - logical, &length, &bioc, 0, 0); + logical, &length, &bioc, NULL, NULL, 0); if (ret) { ASSERT(bioc == NULL); return ret; @@ -6189,9 +6166,7 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) cache = btrfs_lookup_block_group(fs_info, logical); - spin_lock(&cache->lock); - ret = cache->to_copy; - spin_unlock(&cache->lock); + ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags); btrfs_put_block_group(cache); return ret; @@ -6241,7 +6216,6 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, bioc->stripes + i; new->physical = old->physical; - new->length = old->length; new->dev = dev_replace->tgtdev; bioc->tgtdev_map[i] = index_where_to_add; index_where_to_add++; @@ -6282,8 +6256,6 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, bioc->stripes + num_stripes; tgtdev_stripe->physical = physical_of_found; - tgtdev_stripe->length = - bioc->stripes[index_srcdev].length; tgtdev_stripe->dev = dev_replace->tgtdev; bioc->tgtdev_map[index_srcdev] = num_stripes; @@ -6326,7 +6298,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, u64 offset; u64 stripe_offset; u64 stripe_nr; - u64 stripe_len; + u32 stripe_len; u64 raid56_full_stripe_start = (u64)-1; int data_stripes; @@ -6337,19 +6309,13 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, offset = logical - em->start; /* Len of a stripe in a chunk */ stripe_len = map->stripe_len; - /* Stripe where this block falls in */ - stripe_nr = div64_u64(offset, stripe_len); - /* Offset of stripe in the chunk */ - stripe_offset = stripe_nr * stripe_len; - if (offset < stripe_offset) { - btrfs_crit(fs_info, -"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", - stripe_offset, offset, em->start, logical, stripe_len); - return -EINVAL; - } + /* + * Stripe_nr is where this block falls in + * stripe_offset is the offset of this block in its stripe. + */ + stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset); + ASSERT(stripe_offset < U32_MAX); - /* stripe_offset is the offset of this block in its stripe */ - stripe_offset = offset - stripe_offset; data_stripes = nr_data_stripes(map); /* Only stripe based profiles needs to check against stripe length. */ @@ -6396,11 +6362,19 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, return 0; } +static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map, + u32 stripe_index, u64 stripe_offset, u64 stripe_nr) +{ + dst->dev = map->stripes[stripe_index].dev; + dst->physical = map->stripes[stripe_index].physical + + stripe_offset + stripe_nr * map->stripe_len; +} + static int __btrfs_map_block(struct btrfs_fs_info *fs_info, - enum btrfs_map_op op, - u64 logical, u64 *length, + enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret, - int mirror_num, int need_raid_map) + struct btrfs_io_stripe *smap, + int *mirror_num_ret, int need_raid_map) { struct extent_map *em; struct map_lookup *map; @@ -6411,6 +6385,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int data_stripes; int i; int ret = 0; + int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0); int num_stripes; int max_errors = 0; int tgtdev_indexes = 0; @@ -6511,6 +6486,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ASSERT(map->stripe_len == BTRFS_STRIPE_LEN); if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ stripe_nr = div64_u64(raid56_full_stripe_start, @@ -6518,9 +6494,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, /* RAID[56] write or recovery. Return all stripes */ num_stripes = map->num_stripes; - max_errors = nr_parity_stripes(map); + max_errors = btrfs_chunk_max_errors(map); - *length = map->stripe_len; + /* Return the length to the full stripe end */ + *length = min(logical + *length, + raid56_full_stripe_start + em->start + + data_stripes * stripe_len) - logical; stripe_index = 0; stripe_offset = 0; } else { @@ -6567,6 +6546,29 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, tgtdev_indexes = num_stripes; } + /* + * If this I/O maps to a single device, try to return the device and + * physical block information on the stack instead of allocating an + * I/O context structure. + */ + if (smap && num_alloc_stripes == 1 && + !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) && + (!need_full_stripe(op) || !dev_replace_is_ongoing || + !dev_replace->tgtdev)) { + if (patch_the_first_stripe_for_dev_replace) { + smap->dev = dev_replace->tgtdev; + smap->physical = physical_to_patch_in_first_stripe; + *mirror_num_ret = map->num_stripes + 1; + } else { + set_io_stripe(smap, map, stripe_index, stripe_offset, + stripe_nr); + *mirror_num_ret = mirror_num; + } + *bioc_ret = NULL; + ret = 0; + goto out; + } + bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes); if (!bioc) { ret = -ENOMEM; @@ -6574,9 +6576,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } for (i = 0; i < num_stripes; i++) { - bioc->stripes[i].physical = map->stripes[stripe_index].physical + - stripe_offset + stripe_nr * map->stripe_len; - bioc->stripes[i].dev = map->stripes[stripe_index].dev; + set_io_stripe(&bioc->stripes[i], map, stripe_index, stripe_offset, + stripe_nr); stripe_index++; } @@ -6643,12 +6644,8 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret, int mirror_num) { - if (op == BTRFS_MAP_DISCARD) - return __btrfs_map_block_for_discard(fs_info, logical, - length, bioc_ret); - return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, - mirror_num, 0); + NULL, &mirror_num, 0); } /* For Scrub/replace */ @@ -6656,89 +6653,179 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret) { - return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1); + return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, + NULL, NULL, 1); } -static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio) +/* + * Initialize a btrfs_bio structure. This skips the embedded bio itself as it + * is already initialized by the block layer. + */ +static inline void btrfs_bio_init(struct btrfs_bio *bbio, + btrfs_bio_end_io_t end_io, void *private) { - bio->bi_private = bioc->private; - bio->bi_end_io = bioc->end_io; - bio_endio(bio); + memset(bbio, 0, offsetof(struct btrfs_bio, bio)); + bbio->end_io = end_io; + bbio->private = private; +} - btrfs_put_bioc(bioc); +/* + * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for + * btrfs, and is used for all I/O submitted through btrfs_submit_bio. + * + * Just like the underlying bio_alloc_bioset it will not fail as it is backed by + * a mempool. + */ +struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + btrfs_bio_end_io_t end_io, void *private) +{ + struct bio *bio; + + bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); + btrfs_bio_init(btrfs_bio(bio), end_io, private); + return bio; +} + +struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, + btrfs_bio_end_io_t end_io, void *private) +{ + struct bio *bio; + struct btrfs_bio *bbio; + + ASSERT(offset <= UINT_MAX && size <= UINT_MAX); + + bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); + bbio = btrfs_bio(bio); + btrfs_bio_init(bbio, end_io, private); + + bio_trim(bio, offset >> 9, size >> 9); + bbio->iter = bio->bi_iter; + return bio; +} + +static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) +{ + if (!dev || !dev->bdev) + return; + if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET) + return; + + if (btrfs_op(bio) == BTRFS_MAP_WRITE) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); + if (!(bio->bi_opf & REQ_RAHEAD)) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); + if (bio->bi_opf & REQ_PREFLUSH) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); +} + +static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info, + struct bio *bio) +{ + if (bio->bi_opf & REQ_META) + return fs_info->endio_meta_workers; + return fs_info->endio_workers; } -static void btrfs_end_bio(struct bio *bio) +static void btrfs_end_bio_work(struct work_struct *work) +{ + struct btrfs_bio *bbio = + container_of(work, struct btrfs_bio, end_io_work); + + bbio->end_io(bbio); +} + +static void btrfs_simple_end_io(struct bio *bio) +{ + struct btrfs_fs_info *fs_info = bio->bi_private; + struct btrfs_bio *bbio = btrfs_bio(bio); + + btrfs_bio_counter_dec(fs_info); + + if (bio->bi_status) + btrfs_log_dev_io_error(bio, bbio->device); + + if (bio_op(bio) == REQ_OP_READ) { + INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); + queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); + } else { + bbio->end_io(bbio); + } +} + +static void btrfs_raid56_end_io(struct bio *bio) { struct btrfs_io_context *bioc = bio->bi_private; - int is_orig_bio = 0; + struct btrfs_bio *bbio = btrfs_bio(bio); + + btrfs_bio_counter_dec(bioc->fs_info); + bbio->mirror_num = bioc->mirror_num; + bbio->end_io(bbio); + + btrfs_put_bioc(bioc); +} + +static void btrfs_orig_write_end_io(struct bio *bio) +{ + struct btrfs_io_stripe *stripe = bio->bi_private; + struct btrfs_io_context *bioc = stripe->bioc; + struct btrfs_bio *bbio = btrfs_bio(bio); + + btrfs_bio_counter_dec(bioc->fs_info); if (bio->bi_status) { atomic_inc(&bioc->error); - if (bio->bi_status == BLK_STS_IOERR || - bio->bi_status == BLK_STS_TARGET) { - struct btrfs_device *dev = btrfs_bio(bio)->device; - - ASSERT(dev->bdev); - if (btrfs_op(bio) == BTRFS_MAP_WRITE) - btrfs_dev_stat_inc_and_print(dev, - BTRFS_DEV_STAT_WRITE_ERRS); - else if (!(bio->bi_opf & REQ_RAHEAD)) - btrfs_dev_stat_inc_and_print(dev, - BTRFS_DEV_STAT_READ_ERRS); - if (bio->bi_opf & REQ_PREFLUSH) - btrfs_dev_stat_inc_and_print(dev, - BTRFS_DEV_STAT_FLUSH_ERRS); - } + btrfs_log_dev_io_error(bio, stripe->dev); } - if (bio == bioc->orig_bio) - is_orig_bio = 1; - - btrfs_bio_counter_dec(bioc->fs_info); + /* + * Only send an error to the higher layers if it is beyond the tolerance + * threshold. + */ + if (atomic_read(&bioc->error) > bioc->max_errors) + bio->bi_status = BLK_STS_IOERR; + else + bio->bi_status = BLK_STS_OK; - if (atomic_dec_and_test(&bioc->stripes_pending)) { - if (!is_orig_bio) { - bio_put(bio); - bio = bioc->orig_bio; - } + bbio->end_io(bbio); + btrfs_put_bioc(bioc); +} - btrfs_bio(bio)->mirror_num = bioc->mirror_num; - /* only send an error to the higher layers if it is - * beyond the tolerance of the btrfs bio - */ - if (atomic_read(&bioc->error) > bioc->max_errors) { - bio->bi_status = BLK_STS_IOERR; - } else { - /* - * this bio is actually up to date, we didn't - * go over the max number of errors - */ - bio->bi_status = BLK_STS_OK; - } +static void btrfs_clone_write_end_io(struct bio *bio) +{ + struct btrfs_io_stripe *stripe = bio->bi_private; - btrfs_end_bioc(bioc, bio); - } else if (!is_orig_bio) { - bio_put(bio); + if (bio->bi_status) { + atomic_inc(&stripe->bioc->error); + btrfs_log_dev_io_error(bio, stripe->dev); } + + /* Pass on control to the original bio this one was cloned from */ + bio_endio(stripe->bioc->orig_bio); + bio_put(bio); } -static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio, - u64 physical, struct btrfs_device *dev) +static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) { - struct btrfs_fs_info *fs_info = bioc->fs_info; + if (!dev || !dev->bdev || + test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || + (btrfs_op(bio) == BTRFS_MAP_WRITE && + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { + bio_io_error(bio); + return; + } + + bio_set_dev(bio, dev->bdev); - bio->bi_private = bioc; - btrfs_bio(bio)->device = dev; - bio->bi_end_io = btrfs_end_bio; - bio->bi_iter.bi_sector = physical >> 9; /* * For zone append writing, bi_sector must point the beginning of the * zone */ if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + if (btrfs_dev_is_sequential(dev, physical)) { - u64 zone_start = round_down(physical, fs_info->zone_size); + u64 zone_start = round_down(physical, + dev->fs_info->zone_size); bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; } else { @@ -6746,78 +6833,54 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio, bio->bi_opf |= REQ_OP_WRITE; } } - btrfs_debug_in_rcu(fs_info, - "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", - bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, + btrfs_debug_in_rcu(dev->fs_info, + "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", + __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid, bio->bi_iter.bi_size); - bio_set_dev(bio, dev->bdev); - btrfs_bio_counter_inc_noblocked(fs_info); - - btrfsic_submit_bio(bio); + btrfsic_check_bio(bio); + submit_bio(bio); } -static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical) +static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) { - atomic_inc(&bioc->error); - if (atomic_dec_and_test(&bioc->stripes_pending)) { - /* Should be the original bio. */ - WARN_ON(bio != bioc->orig_bio); + struct bio *orig_bio = bioc->orig_bio, *bio; - btrfs_bio(bio)->mirror_num = bioc->mirror_num; - bio->bi_iter.bi_sector = logical >> 9; - if (atomic_read(&bioc->error) > bioc->max_errors) - bio->bi_status = BLK_STS_IOERR; - else - bio->bi_status = BLK_STS_OK; - btrfs_end_bioc(bioc, bio); + ASSERT(bio_op(orig_bio) != REQ_OP_READ); + + /* Reuse the bio embedded into the btrfs_bio for the last mirror */ + if (dev_nr == bioc->num_stripes - 1) { + bio = orig_bio; + bio->bi_end_io = btrfs_orig_write_end_io; + } else { + bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set); + bio_inc_remaining(orig_bio); + bio->bi_end_io = btrfs_clone_write_end_io; } + + bio->bi_private = &bioc->stripes[dev_nr]; + bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT; + bioc->stripes[dev_nr].bioc = bioc; + btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); } -blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num) +void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) { - struct btrfs_device *dev; - struct bio *first_bio = bio; u64 logical = bio->bi_iter.bi_sector << 9; - u64 length = 0; - u64 map_length; - int ret; - int dev_nr; - int total_devs; + u64 length = bio->bi_iter.bi_size; + u64 map_length = length; struct btrfs_io_context *bioc = NULL; - - length = bio->bi_iter.bi_size; - map_length = length; + struct btrfs_io_stripe smap; + int ret; btrfs_bio_counter_inc_blocked(fs_info); - ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, - &map_length, &bioc, mirror_num, 1); + ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, + &bioc, &smap, &mirror_num, 1); if (ret) { btrfs_bio_counter_dec(fs_info); - return errno_to_blk_status(ret); - } - - total_devs = bioc->num_stripes; - bioc->orig_bio = first_bio; - bioc->private = first_bio->bi_private; - bioc->end_io = first_bio->bi_end_io; - atomic_set(&bioc->stripes_pending, bioc->num_stripes); - - if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && - ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) { - /* In this case, map_length has been set to the length of - a single stripe; not the whole write */ - if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - ret = raid56_parity_write(bio, bioc, map_length); - } else { - ret = raid56_parity_recover(bio, bioc, map_length, - mirror_num, 1); - } - - btrfs_bio_counter_dec(fs_info); - return errno_to_blk_status(ret); + btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); + return; } if (map_length < length) { @@ -6827,25 +6890,31 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, BUG(); } - for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { - dev = bioc->stripes[dev_nr].dev; - if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, - &dev->dev_state) || - (btrfs_op(first_bio) == BTRFS_MAP_WRITE && - !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { - bioc_error(bioc, first_bio, logical); - continue; - } - - if (dev_nr < total_devs - 1) - bio = btrfs_bio_clone(first_bio); + if (!bioc) { + /* Single mirror read/write fast path */ + btrfs_bio(bio)->mirror_num = mirror_num; + btrfs_bio(bio)->device = smap.dev; + bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; + bio->bi_private = fs_info; + bio->bi_end_io = btrfs_simple_end_io; + btrfs_submit_dev_bio(smap.dev, bio); + } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + /* Parity RAID write or read recovery */ + bio->bi_private = bioc; + bio->bi_end_io = btrfs_raid56_end_io; + if (bio_op(bio) == REQ_OP_READ) + raid56_parity_recover(bio, bioc, mirror_num); else - bio = first_bio; + raid56_parity_write(bio, bioc); + } else { + /* Write to multiple mirrors */ + int total_devs = bioc->num_stripes; + int dev_nr; - submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev); + bioc->orig_bio = bio; + for (dev_nr = 0; dev_nr < total_devs; dev_nr++) + btrfs_submit_mirrored_bio(bioc, dev_nr); } - btrfs_bio_counter_dec(fs_info); - return BLK_STS_OK; } static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, @@ -6861,18 +6930,18 @@ static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args, const struct btrfs_device *device) { - ASSERT((args->devid != (u64)-1) || args->missing); + if (args->missing) { + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) && + !device->bdev) + return true; + return false; + } - if ((args->devid != (u64)-1) && device->devid != args->devid) + if (device->devid != args->devid) return false; if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0) return false; - if (!args->missing) - return true; - if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) && - !device->bdev) - return true; - return false; + return true; } /* @@ -6962,16 +7031,6 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, if (!dev) return ERR_PTR(-ENOMEM); - /* - * Preallocate a bio that's always going to be used for flushing device - * barriers and matches the device lifespan - */ - dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0); - if (!dev->flush_bio) { - kfree(dev); - return ERR_PTR(-ENOMEM); - } - INIT_LIST_HEAD(&dev->dev_list); INIT_LIST_HEAD(&dev->dev_alloc_list); INIT_LIST_HEAD(&dev->post_commit_list); @@ -7013,11 +7072,12 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, devid, uuid); } -static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) +u64 btrfs_calc_stripe_length(const struct extent_map *em) { - const int data_stripes = calc_data_stripes(type, num_stripes); + const struct map_lookup *map = em->map_lookup; + const int data_stripes = calc_data_stripes(map->type, map->num_stripes); - return div_u64(chunk_len, data_stripes); + return div_u64(em->len, data_stripes); } #if BITS_PER_LONG == 32 @@ -7060,6 +7120,27 @@ static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info, } #endif +static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info, + u64 devid, u8 *uuid) +{ + struct btrfs_device *dev; + + if (!btrfs_test_opt(fs_info, DEGRADED)) { + btrfs_report_missing_device(fs_info, devid, uuid, true); + return ERR_PTR(-ENOENT); + } + + dev = add_missing_dev(fs_info->fs_devices, devid, uuid); + if (IS_ERR(dev)) { + btrfs_err(fs_info, "failed to init missing device %llu: %ld", + devid, PTR_ERR(dev)); + return dev; + } + btrfs_report_missing_device(fs_info, devid, uuid, false); + + return dev; +} + static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) { @@ -7073,6 +7154,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, u64 devid; u64 type; u8 uuid[BTRFS_UUID_SIZE]; + int index; int num_stripes; int ret; int i; @@ -7080,6 +7162,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, logical = key->offset; length = btrfs_chunk_length(leaf, chunk); type = btrfs_chunk_type(leaf, chunk); + index = btrfs_bg_flags_to_raid_index(type); num_stripes = btrfs_chunk_num_stripes(leaf, chunk); #if BITS_PER_LONG == 32 @@ -7133,10 +7216,17 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, map->io_align = btrfs_chunk_io_align(leaf, chunk); map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); map->type = type; - map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + /* + * We can't use the sub_stripes value, as for profiles other than + * RAID10, they may have 0 as sub_stripes for filesystems created by + * older mkfs (<v5.4). + * In that case, it can cause divide-by-zero errors later. + * Since currently sub_stripes is fixed for each profile, let's + * use the trusted value instead. + */ + map->sub_stripes = btrfs_raid_array[index].sub_stripes; map->verified_stripes = 0; - em->orig_block_len = calc_stripe_length(type, em->len, - map->num_stripes); + em->orig_block_len = btrfs_calc_stripe_length(em); for (i = 0; i < num_stripes; i++) { map->stripes[i].physical = btrfs_stripe_offset_nr(leaf, chunk, i); @@ -7147,28 +7237,17 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, BTRFS_UUID_SIZE); args.uuid = uuid; map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args); - if (!map->stripes[i].dev && - !btrfs_test_opt(fs_info, DEGRADED)) { - free_extent_map(em); - btrfs_report_missing_device(fs_info, devid, uuid, true); - return -ENOENT; - } if (!map->stripes[i].dev) { - map->stripes[i].dev = - add_missing_dev(fs_info->fs_devices, devid, - uuid); + map->stripes[i].dev = handle_missing_device(fs_info, + devid, uuid); if (IS_ERR(map->stripes[i].dev)) { free_extent_map(em); - btrfs_err(fs_info, - "failed to init missing dev %llu: %ld", - devid, PTR_ERR(map->stripes[i].dev)); return PTR_ERR(map->stripes[i].dev); } - btrfs_report_missing_device(fs_info, devid, uuid, false); } + set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &(map->stripes[i].dev->dev_state)); - } write_lock(&map_tree->lock); @@ -7273,7 +7352,8 @@ static int read_one_dev(struct extent_buffer *leaf, u8 fs_uuid[BTRFS_FSID_SIZE]; u8 dev_uuid[BTRFS_UUID_SIZE]; - devid = args.devid = btrfs_device_id(leaf, dev_item); + devid = btrfs_device_id(leaf, dev_item); + args.devid = devid; read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE); read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), @@ -7373,7 +7453,6 @@ static int read_one_dev(struct extent_buffer *leaf, int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) { - struct btrfs_root *root = fs_info->tree_root; struct btrfs_super_block *super_copy = fs_info->super_copy; struct extent_buffer *sb; struct btrfs_disk_key *disk_key; @@ -7389,30 +7468,16 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) struct btrfs_key key; ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); + /* - * This will create extent buffer of nodesize, superblock size is - * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will - * overallocate but we can keep it as-is, only the first page is used. + * We allocated a dummy extent, just to use extent buffer accessors. + * There will be unused space after BTRFS_SUPER_INFO_SIZE, but + * that's fine, we will not go beyond system chunk array anyway. */ - sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET, - root->root_key.objectid, 0); - if (IS_ERR(sb)) - return PTR_ERR(sb); + sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET); + if (!sb) + return -ENOMEM; set_extent_buffer_uptodate(sb); - /* - * The sb extent buffer is artificial and just used to read the system array. - * set_extent_buffer_uptodate() call does not properly mark all it's - * pages up-to-date when the page is larger: extent does not cover the - * whole page and consequently check_page_uptodate does not find all - * the page's extents up-to-date (the hole beyond sb), - * write_extent_buffer then triggers a WARN_ON. - * - * Regular short extents go through mark_extent_buffer_dirty/writeback cycle, - * but sb spans only this function. Add an explicit SetPageUptodate call - * to silence the warning eg. on PowerPC 64. - */ - if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE) - SetPageUptodate(sb->pages[0]); write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); @@ -7575,6 +7640,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) struct btrfs_key found_key; int ret; int slot; + int iter_ret = 0; u64 total_dev = 0; u64 last_ra_node = 0; @@ -7618,30 +7684,18 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.offset = 0; key.type = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto error; - while (1) { - struct extent_buffer *node; + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { + struct extent_buffer *node = path->nodes[1]; leaf = path->nodes[0]; slot = path->slots[0]; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto error; - break; - } - node = path->nodes[1]; + if (node) { if (last_ra_node != node->start) { readahead_tree_node_children(node); last_ra_node = node->start; } } - btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.type == BTRFS_DEV_ITEM_KEY) { struct btrfs_dev_item *dev_item; dev_item = btrfs_item_ptr(leaf, slot, @@ -7666,7 +7720,11 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) if (ret) goto error; } - path->slots[0]++; + } + /* Catch error found during iteration */ + if (iter_ret < 0) { + ret = iter_ret; + goto error; } /* @@ -7674,12 +7732,12 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) * do another round of validation checks. */ if (total_dev != fs_info->fs_devices->total_devices) { - btrfs_err(fs_info, - "super_num_devices %llu mismatch with num_devices %llu found here", + btrfs_warn(fs_info, +"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit", btrfs_super_num_devices(fs_info->super_copy), total_dev); - ret = -EINVAL; - goto error; + fs_info->fs_devices->total_devices = total_dev; + btrfs_set_super_num_devices(fs_info->super_copy, total_dev); } if (btrfs_super_total_bytes(fs_info->super_copy) < fs_info->fs_devices->total_rw_bytes) { @@ -7698,10 +7756,11 @@ error: return ret; } -void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) +int btrfs_init_devices_late(struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; struct btrfs_device *device; + int ret = 0; fs_devices->fs_info = fs_info; @@ -7710,12 +7769,18 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) device->fs_info = fs_info; list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { - list_for_each_entry(device, &seed_devs->devices, dev_list) + list_for_each_entry(device, &seed_devs->devices, dev_list) { device->fs_info = fs_info; + ret = btrfs_get_dev_zone_info(device, false); + if (ret) + break; + } seed_devs->fs_info = fs_info; } mutex_unlock(&fs_devices->device_list_mutex); + + return ret; } static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, @@ -7924,11 +7989,7 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans) void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) { btrfs_dev_stat_inc(dev, index); - btrfs_dev_stat_print_on_error(dev); -} -static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) -{ if (!dev->dev_stats_valid) return; btrfs_err_rl_in_rcu(dev->fs_info, @@ -8070,7 +8131,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, } map = em->map_lookup; - stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes); + stripe_len = btrfs_calc_stripe_length(em); if (physical_len != stripe_len) { btrfs_err(fs_info, "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu", @@ -8080,6 +8141,16 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, goto out; } + /* + * Very old mkfs.btrfs (before v4.1) will not respect the reserved + * space. Although kernel can handle it without problem, better to warn + * the users. + */ + if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED) + btrfs_warn(fs_info, + "devid %llu physical %llu len %llu inside the reserved space", + devid, physical_offset, physical_len); + for (i = 0; i < map->num_stripes; i++) { if (map->stripes[i].dev->devid == devid && map->stripes[i].physical == physical_offset) { @@ -8291,7 +8362,7 @@ bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr) static int relocating_repair_kthread(void *data) { - struct btrfs_block_group *cache = (struct btrfs_block_group *)data; + struct btrfs_block_group *cache = data; struct btrfs_fs_info *fs_info = cache->fs_info; u64 target; int ret = 0; @@ -8299,10 +8370,12 @@ static int relocating_repair_kthread(void *data) target = cache->start; btrfs_put_block_group(cache); + sb_start_write(fs_info->sb); if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { btrfs_info(fs_info, "zoned: skip relocating block group %llu to repair: EBUSY", target); + sb_end_write(fs_info->sb); return -EBUSY; } @@ -8313,7 +8386,7 @@ static int relocating_repair_kthread(void *data) if (!cache) goto out; - if (!cache->relocating_repair) + if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) goto out; ret = btrfs_may_alloc_data_chunk(fs_info, target); @@ -8330,6 +8403,7 @@ out: btrfs_put_block_group(cache); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_exclop_finish(fs_info); + sb_end_write(fs_info->sb); return ret; } @@ -8349,17 +8423,27 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) if (!cache) return true; - spin_lock(&cache->lock); - if (cache->relocating_repair) { - spin_unlock(&cache->lock); + if (test_and_set_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) { btrfs_put_block_group(cache); return true; } - cache->relocating_repair = 1; - spin_unlock(&cache->lock); kthread_run(relocating_repair_kthread, cache, "btrfs-relocating-repair"); return true; } + +int __init btrfs_bioset_init(void) +{ + if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_bio, bio), + BIOSET_NEED_BVECS)) + return -ENOMEM; + return 0; +} + +void __cold btrfs_bioset_exit(void) +{ + bioset_exit(&btrfs_bioset); +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 005c9e2a491a..099def5613b8 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -17,17 +17,51 @@ extern struct mutex uuid_mutex; #define BTRFS_STRIPE_LEN SZ_64K +/* Used by sanity check for btrfs_raid_types. */ +#define const_ffs(n) (__builtin_ctzll(n) + 1) + +/* + * The conversion from BTRFS_BLOCK_GROUP_* bits to btrfs_raid_type requires + * RAID0 always to be the lowest profile bit. + * Although it's part of on-disk format and should never change, do extra + * compile-time sanity checks. + */ +static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) < + const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0)); +static_assert(const_ilog2(BTRFS_BLOCK_GROUP_RAID0) > + ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK)); + +/* ilog2() can handle both constants and variables */ +#define BTRFS_BG_FLAG_TO_INDEX(profile) \ + ilog2((profile) >> (ilog2(BTRFS_BLOCK_GROUP_RAID0) - 1)) + +enum btrfs_raid_types { + /* SINGLE is the special one as it doesn't have on-disk bit. */ + BTRFS_RAID_SINGLE = 0, + + BTRFS_RAID_RAID0 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID0), + BTRFS_RAID_RAID1 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1), + BTRFS_RAID_DUP = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_DUP), + BTRFS_RAID_RAID10 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID10), + BTRFS_RAID_RAID5 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID5), + BTRFS_RAID_RAID6 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID6), + BTRFS_RAID_RAID1C3 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C3), + BTRFS_RAID_RAID1C4 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C4), + + BTRFS_NR_RAID_TYPES +}; + struct btrfs_io_geometry { /* remaining bytes before crossing a stripe */ u64 len; /* offset of logical address in chunk */ u64 offset; /* length of single IO stripe */ - u64 stripe_len; + u32 stripe_len; + /* offset of address in stripe */ + u32 stripe_offset; /* number of stripe where address falls */ u64 stripe_nr; - /* offset of address in stripe */ - u64 stripe_offset; /* offset of raid56 stripe into the chunk */ u64 raid56_stripe_offset; }; @@ -72,6 +106,11 @@ struct btrfs_device { /* the mode sent to blkdev_get */ fmode_t mode; + /* + * Device's major-minor number. Must be set even if the device is not + * opened (bdev == NULL), unless the device is missing. + */ + dev_t devt; unsigned long dev_state; blk_status_t last_flush_error; @@ -116,8 +155,8 @@ struct btrfs_device { /* bytes used on the current transaction */ u64 commit_bytes_used; - /* for sending down flush barriers */ - struct bio *flush_bio; + /* Bio used for flushing device barriers */ + struct bio flush_bio; struct completion flush_wait; /* per-device scrub information */ @@ -142,6 +181,31 @@ struct btrfs_device { }; /* + * Block group or device which contains an active swapfile. Used for preventing + * unsafe operations while a swapfile is active. + * + * These are sorted on (ptr, inode) (note that a block group or device can + * contain more than one swapfile). We compare the pointer values because we + * don't actually care what the object is, we just need a quick check whether + * the object exists in the rbtree. + */ +struct btrfs_swapfile_pin { + struct rb_node node; + void *ptr; + struct inode *inode; + /* + * If true, ptr points to a struct btrfs_block_group. Otherwise, ptr + * points to a struct btrfs_device. + */ + bool is_block_group; + /* + * Only used when 'is_block_group' is true and it is the number of + * extents used by a swapfile for this block group ('ptr' field). + */ + int bg_extent_count; +}; + +/* * If we read those variants at the context of their own lock, we needn't * use the following helpers, reading them directly is safe. */ @@ -316,18 +380,37 @@ struct btrfs_fs_devices { / sizeof(struct btrfs_stripe) + 1) /* + * Maximum number of sectors for a single bio to limit the size of the + * checksum array. This matches the number of bio_vecs per bio and thus the + * I/O size for buffered I/O. + */ +#define BTRFS_MAX_BIO_SECTORS (256) + +typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); + +/* * Additional info to pass along bio. * * Mostly for btrfs specific features like csum and mirror_num. */ struct btrfs_bio { unsigned int mirror_num; + struct bvec_iter iter; + + /* for direct I/O */ + u64 file_offset; /* @device is for stripe IO submission. */ struct btrfs_device *device; u8 *csum; u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; - struct bvec_iter iter; + + /* End I/O information supplied to btrfs_bio_alloc */ + btrfs_bio_end_io_t end_io; + void *private; + + /* For read end I/O handling */ + struct work_struct end_io_work; /* * This member must come last, bio_alloc_bioset will allocate enough @@ -341,6 +424,20 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio) return container_of(bio, struct btrfs_bio, bio); } +int __init btrfs_bioset_init(void); +void __cold btrfs_bioset_exit(void); + +struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + btrfs_bio_end_io_t end_io, void *private); +struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, + btrfs_bio_end_io_t end_io, void *private); + +static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) +{ + bbio->bio.bi_status = status; + bbio->end_io(bbio); +} + static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) { if (bbio->csum != bbio->csum_inline) { @@ -349,10 +446,36 @@ static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) } } +/* + * Iterate through a btrfs_bio (@bbio) on a per-sector basis. + * + * bvl - struct bio_vec + * bbio - struct btrfs_bio + * iters - struct bvec_iter + * bio_offset - unsigned int + */ +#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \ + for ((iter) = (bbio)->iter, (bio_offset) = 0; \ + (iter).bi_size && \ + (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \ + (bio_offset) += fs_info->sectorsize, \ + bio_advance_iter_single(&(bbio)->bio, &(iter), \ + (fs_info)->sectorsize)) + struct btrfs_io_stripe { struct btrfs_device *dev; + union { + /* Block mapping */ + u64 physical; + /* For the endio handler */ + struct btrfs_io_context *bioc; + }; +}; + +struct btrfs_discard_stripe { + struct btrfs_device *dev; u64 physical; - u64 length; /* only used for discard mappings */ + u64 length; }; /* @@ -373,12 +496,9 @@ struct btrfs_io_stripe { */ struct btrfs_io_context { refcount_t refs; - atomic_t stripes_pending; struct btrfs_fs_info *fs_info; u64 map_type; /* get from map_lookup->type */ - bio_end_io_t *end_io; struct bio *orig_bio; - void *private; atomic_t error; int max_errors; int num_stripes; @@ -422,7 +542,7 @@ struct map_lookup { u64 type; int io_align; int io_width; - u64 stripe_len; + u32 stripe_len; int num_stripes; int sub_stripes; int verified_stripes; /* For mount time dev extent verification */ @@ -491,6 +611,9 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret); +struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, + u64 logical, u64 *length_ret, + u32 *num_stripes); int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map, enum btrfs_map_op op, u64 logical, struct btrfs_io_geometry *io_geom); @@ -499,13 +622,12 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, u64 type); void btrfs_mapping_tree_free(struct extent_map_tree *tree); -blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num); +void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, void *holder); -int btrfs_forget_devices(const char *path); +int btrfs_forget_devices(dev_t devt); void btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices); void btrfs_assign_next_active_device(struct btrfs_device *device, @@ -549,7 +671,7 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_get_dev_stats *stats); -void btrfs_init_devices_late(struct btrfs_fs_info *fs_info); +int btrfs_init_devices_late(struct btrfs_fs_info *fs_info); int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); int btrfs_run_dev_stats(struct btrfs_trans_handle *trans); void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev); @@ -559,6 +681,8 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical); +u64 btrfs_calc_stripe_length(const struct extent_map *em); +int btrfs_nr_parity_stripes(u64 type); int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, struct btrfs_block_group *bg); int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); @@ -632,4 +756,6 @@ const char *btrfs_bg_type_to_raid_name(u64 flags); int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); +bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); + #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 99abf41b89b9..5bb8d8c86311 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -262,7 +262,8 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name, inode_inc_iversion(inode); inode->i_ctime = current_time(inode); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); - BUG_ON(ret); + if (ret) + btrfs_abort_transaction(trans, ret); out: if (start_trans) btrfs_end_transaction(trans); @@ -271,10 +272,12 @@ out: ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) { + struct btrfs_key found_key; struct btrfs_key key; struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; + int iter_ret = 0; int ret = 0; size_t total_size = 0, size_left = size; @@ -293,44 +296,23 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) path->reada = READA_FORWARD; /* search for our xattrs */ - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto err; - - while (1) { + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct extent_buffer *leaf; int slot; struct btrfs_dir_item *di; - struct btrfs_key found_key; u32 item_size; u32 cur; leaf = path->nodes[0]; slot = path->slots[0]; - /* this is where we start walking through the path */ - if (slot >= btrfs_header_nritems(leaf)) { - /* - * if we've reached the last slot in this leaf we need - * to go to the next leaf and reset everything - */ - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto err; - else if (ret > 0) - break; - continue; - } - - btrfs_item_key_to_cpu(leaf, &found_key, slot); - /* check to make sure this item is what we want */ if (found_key.objectid != key.objectid) break; if (found_key.type > BTRFS_XATTR_ITEM_KEY) break; if (found_key.type < BTRFS_XATTR_ITEM_KEY) - goto next_item; + continue; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); item_size = btrfs_item_size(leaf, slot); @@ -350,8 +332,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) goto next; if (!buffer || (name_len + 1) > size_left) { - ret = -ERANGE; - goto err; + iter_ret = -ERANGE; + break; } read_extent_buffer(leaf, buffer, name_ptr, name_len); @@ -363,12 +345,13 @@ next: cur += this_len; di = (struct btrfs_dir_item *)((char *)di + this_len); } -next_item: - path->slots[0]++; } - ret = total_size; -err: + if (iter_ret < 0) + ret = iter_ret; + else + ret = total_size; + btrfs_free_path(path); return ret; @@ -388,6 +371,9 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler, const char *name, const void *buffer, size_t size, int flags) { + if (btrfs_root_readonly(BTRFS_I(inode)->root)) + return -EROFS; + name = xattr_full_name(handler, name); return btrfs_setxattr_trans(inode, name, buffer, size, flags); } @@ -403,10 +389,13 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, struct btrfs_root *root = BTRFS_I(inode)->root; name = xattr_full_name(handler, name); - ret = btrfs_validate_prop(name, value, size); + ret = btrfs_validate_prop(BTRFS_I(inode), name, value, size); if (ret) return ret; + if (btrfs_ignore_prop(BTRFS_I(inode), name)) + return 0; + trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -416,7 +405,8 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, inode_inc_iversion(inode); inode->i_ctime = current_time(inode); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); - BUG_ON(ret); + if (ret) + btrfs_abort_transaction(trans, ret); } btrfs_end_transaction(trans); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 767a0c6c9694..b4f44662cda7 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -97,7 +97,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret; - char *data_in; + char *data_in = NULL; char *cpage_out; int nr_pages = 0; struct page *in_page = NULL; @@ -126,7 +126,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = kmap(out_page); + cpage_out = page_address(out_page); pages[0] = out_page; nr_pages = 1; @@ -148,26 +148,26 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, int i; for (i = 0; i < in_buf_pages; i++) { - if (in_page) { - kunmap(in_page); + if (data_in) { + kunmap_local(data_in); put_page(in_page); } in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = kmap(in_page); + data_in = kmap_local_page(in_page); memcpy(workspace->buf + i * PAGE_SIZE, data_in, PAGE_SIZE); start += PAGE_SIZE; } workspace->strm.next_in = workspace->buf; } else { - if (in_page) { - kunmap(in_page); + if (data_in) { + kunmap_local(data_in); put_page(in_page); } in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = kmap(in_page); + data_in = kmap_local_page(in_page); start += PAGE_SIZE; workspace->strm.next_in = data_in; } @@ -196,9 +196,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, * the stream end if required */ if (workspace->strm.avail_out == 0) { - kunmap(out_page); if (nr_pages == nr_dest_pages) { - out_page = NULL; ret = -E2BIG; goto out; } @@ -207,7 +205,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = kmap(out_page); + cpage_out = page_address(out_page); pages[nr_pages] = out_page; nr_pages++; workspace->strm.avail_out = PAGE_SIZE; @@ -234,9 +232,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } else if (workspace->strm.avail_out == 0) { /* get another page for the stream end */ - kunmap(out_page); if (nr_pages == nr_dest_pages) { - out_page = NULL; ret = -E2BIG; goto out; } @@ -245,7 +241,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = kmap(out_page); + cpage_out = page_address(out_page); pages[nr_pages] = out_page; nr_pages++; workspace->strm.avail_out = PAGE_SIZE; @@ -264,13 +260,11 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, *total_in = workspace->strm.total_in; out: *out_pages = nr_pages; - if (out_page) - kunmap(out_page); - - if (in_page) { - kunmap(in_page); + if (data_in) { + kunmap_local(data_in); put_page(in_page); } + return ret; } @@ -287,7 +281,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) unsigned long buf_start; struct page **pages_in = cb->compressed_pages; - data_in = kmap(pages_in[page_in_index]); + data_in = kmap_local_page(pages_in[page_in_index]); workspace->strm.next_in = data_in; workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE); workspace->strm.total_in = 0; @@ -309,7 +303,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) { pr_warn("BTRFS: inflateInit failed\n"); - kunmap(pages_in[page_in_index]); + kunmap_local(data_in); return -EIO; } while (workspace->strm.total_in < srclen) { @@ -336,13 +330,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (workspace->strm.avail_in == 0) { unsigned long tmp; - kunmap(pages_in[page_in_index]); + kunmap_local(data_in); page_in_index++; if (page_in_index >= total_pages_in) { data_in = NULL; break; } - data_in = kmap(pages_in[page_in_index]); + data_in = kmap_local_page(pages_in[page_in_index]); workspace->strm.next_in = data_in; tmp = srclen - workspace->strm.total_in; workspace->strm.avail_in = min(tmp, PAGE_SIZE); @@ -355,7 +349,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) done: zlib_inflateEnd(&workspace->strm); if (data_in) - kunmap(pages_in[page_in_index]); + kunmap_local(data_in); if (!ret) zero_fill_bio(cb->orig_bio); return ret; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index f559d517c7c4..c9e2b0c85309 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -51,11 +51,13 @@ #define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5) /* - * Maximum supported zone size. Currently, SMR disks have a zone size of - * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not - * expect the zone size to become larger than 8GiB in the near future. + * Minimum / maximum supported zone size. Currently, SMR disks have a zone + * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. + * We do not expect the zone size to become larger than 8GiB or smaller than + * 4MiB in the near future. */ #define BTRFS_MAX_ZONE_SIZE SZ_8G +#define BTRFS_MIN_ZONE_SIZE SZ_4M #define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT) @@ -92,9 +94,9 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, * Possible states of log buffer zones * * Empty[0] In use[0] Full[0] - * Empty[1] * x 0 - * In use[1] 0 x 0 - * Full[1] 1 1 C + * Empty[1] * 0 1 + * In use[1] x x 1 + * Full[1] 0 0 C * * Log position: * *: Special case, no superblock is written @@ -132,7 +134,8 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, super[i] = page_address(page[i]); } - if (super[0]->generation > super[1]->generation) + if (btrfs_super_generation(super[0]) > + btrfs_super_generation(super[1])) sector = zones[1].start; else sector = zones[0].start; @@ -350,7 +353,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_zoned_device_info *zone_info = NULL; struct block_device *bdev = device->bdev; - struct request_queue *queue = bdev_get_queue(bdev); unsigned int max_active_zones; unsigned int nactive; sector_t nr_sectors; @@ -402,15 +404,41 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) zone_info->zone_size, BTRFS_MAX_ZONE_SIZE); ret = -EINVAL; goto out; + } else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) { + btrfs_err_in_rcu(fs_info, + "zoned: %s: zone size %llu smaller than supported minimum %u", + rcu_str_deref(device->name), + zone_info->zone_size, BTRFS_MIN_ZONE_SIZE); + ret = -EINVAL; + goto out; } nr_sectors = bdev_nr_sectors(bdev); zone_info->zone_size_shift = ilog2(zone_info->zone_size); zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors); + /* + * We limit max_zone_append_size also by max_segments * + * PAGE_SIZE. Technically, we can have multiple pages per segment. But, + * since btrfs adds the pages one by one to a bio, and btrfs cannot + * increase the metadata reservation even if it increases the number of + * extents, it is safe to stick with the limit. + * + * With the zoned emulation, we can have non-zoned device on the zoned + * mode. In this case, we don't have a valid max zone append size. So, + * use max_segments * PAGE_SIZE as the pseudo max_zone_append_size. + */ + if (bdev_is_zoned(bdev)) { + zone_info->max_zone_append_size = min_t(u64, + (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT, + (u64)bdev_max_segments(bdev) << PAGE_SHIFT); + } else { + zone_info->max_zone_append_size = + (u64)bdev_max_segments(bdev) << PAGE_SHIFT; + } if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; - max_active_zones = queue_max_active_zones(queue); + max_active_zones = bdev_max_active_zones(bdev); if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) { btrfs_err_in_rcu(fs_info, "zoned: %s: max active zones %u is too small, need at least %u active zones", @@ -439,7 +467,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) goto out; } - zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL); + zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL); if (!zones) { ret = -ENOMEM; goto out; @@ -558,7 +586,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) } - kfree(zones); + kvfree(zones); switch (bdev_zoned_model(bdev)) { case BLK_ZONED_HM: @@ -590,7 +618,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) return 0; out: - kfree(zones); + kvfree(zones); out_free_zone_info: btrfs_destroy_dev_zone_info(device); @@ -612,6 +640,46 @@ void btrfs_destroy_dev_zone_info(struct btrfs_device *device) device->zone_info = NULL; } +struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev) +{ + struct btrfs_zoned_device_info *zone_info; + + zone_info = kmemdup(orig_dev->zone_info, sizeof(*zone_info), GFP_KERNEL); + if (!zone_info) + return NULL; + + zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); + if (!zone_info->seq_zones) + goto out; + + bitmap_copy(zone_info->seq_zones, orig_dev->zone_info->seq_zones, + zone_info->nr_zones); + + zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); + if (!zone_info->empty_zones) + goto out; + + bitmap_copy(zone_info->empty_zones, orig_dev->zone_info->empty_zones, + zone_info->nr_zones); + + zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); + if (!zone_info->active_zones) + goto out; + + bitmap_copy(zone_info->active_zones, orig_dev->zone_info->active_zones, + zone_info->nr_zones); + zone_info->zone_cache = NULL; + + return zone_info; + +out: + bitmap_free(zone_info->seq_zones); + bitmap_free(zone_info->empty_zones); + bitmap_free(zone_info->active_zones); + kfree(zone_info); + return NULL; +} + int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) { @@ -625,75 +693,55 @@ int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, return 0; } +static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info) +{ + struct btrfs_device *device; + + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { + if (device->bdev && + bdev_zoned_model(device->bdev) == BLK_ZONED_HM) { + btrfs_err(fs_info, + "zoned: mode not enabled but zoned device found: %pg", + device->bdev); + return -EINVAL; + } + } + + return 0; +} + int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; - u64 zoned_devices = 0; - u64 nr_devices = 0; u64 zone_size = 0; - const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED); - int ret = 0; + u64 max_zone_append_size = 0; + int ret; - /* Count zoned devices */ - list_for_each_entry(device, &fs_devices->devices, dev_list) { - enum blk_zoned_model model; + /* + * Host-Managed devices can't be used without the ZONED flag. With the + * ZONED all devices can be used, using zone emulation if required. + */ + if (!btrfs_fs_incompat(fs_info, ZONED)) + return btrfs_check_for_zoned_device(fs_info); + + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { + struct btrfs_zoned_device_info *zone_info = device->zone_info; if (!device->bdev) continue; - model = bdev_zoned_model(device->bdev); - /* - * A Host-Managed zoned device must be used as a zoned device. - * A Host-Aware zoned device and a non-zoned devices can be - * treated as a zoned device, if ZONED flag is enabled in the - * superblock. - */ - if (model == BLK_ZONED_HM || - (model == BLK_ZONED_HA && incompat_zoned) || - (model == BLK_ZONED_NONE && incompat_zoned)) { - struct btrfs_zoned_device_info *zone_info = - device->zone_info; - - zone_info = device->zone_info; - zoned_devices++; - if (!zone_size) { - zone_size = zone_info->zone_size; - } else if (zone_info->zone_size != zone_size) { - btrfs_err(fs_info, + if (!zone_size) { + zone_size = zone_info->zone_size; + } else if (zone_info->zone_size != zone_size) { + btrfs_err(fs_info, "zoned: unequal block device zone sizes: have %llu found %llu", - device->zone_info->zone_size, - zone_size); - ret = -EINVAL; - goto out; - } + zone_info->zone_size, zone_size); + return -EINVAL; } - nr_devices++; - } - - if (!zoned_devices && !incompat_zoned) - goto out; - - if (!zoned_devices && incompat_zoned) { - /* No zoned block device found on ZONED filesystem */ - btrfs_err(fs_info, - "zoned: no zoned devices found on a zoned filesystem"); - ret = -EINVAL; - goto out; - } - - if (zoned_devices && !incompat_zoned) { - btrfs_err(fs_info, - "zoned: mode not enabled but zoned device found"); - ret = -EINVAL; - goto out; - } - - if (zoned_devices != nr_devices) { - btrfs_err(fs_info, - "zoned: cannot mix zoned and regular devices"); - ret = -EINVAL; - goto out; + if (!max_zone_append_size || + (zone_info->max_zone_append_size && + zone_info->max_zone_append_size < max_zone_append_size)) + max_zone_append_size = zone_info->max_zone_append_size; } /* @@ -705,18 +753,20 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "zoned: zone size %llu not aligned to stripe %u", zone_size, BTRFS_STRIPE_LEN); - ret = -EINVAL; - goto out; + return -EINVAL; } if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { btrfs_err(fs_info, "zoned: mixed block groups not supported"); - ret = -EINVAL; - goto out; + return -EINVAL; } fs_info->zone_size = zone_size; + fs_info->max_zone_append_size = ALIGN_DOWN(max_zone_append_size, + fs_info->sectorsize); fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; + if (fs_info->max_zone_append_size < fs_info->max_extent_size) + fs_info->max_extent_size = fs_info->max_zone_append_size; /* * Check mount options here, because we might change fs_info->zoned @@ -724,11 +774,10 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) */ ret = btrfs_check_mountopts_zoned(fs_info); if (ret) - goto out; + return ret; btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size); -out: - return ret; + return 0; } int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) @@ -1151,7 +1200,7 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) * offset. */ static int calculate_alloc_pointer(struct btrfs_block_group *cache, - u64 *offset_ret) + u64 *offset_ret, bool new) { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_root *root; @@ -1161,6 +1210,21 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, int ret; u64 length; + /* + * Avoid tree lookups for a new block group, there's no use for it. + * It must always be 0. + * + * Also, we have a lock chain of extent buffer lock -> chunk mutex. + * For new a block group, this function is called from + * btrfs_make_block_group() which is already taking the chunk mutex. + * Thus, we cannot call calculate_alloc_pointer() which takes extent + * buffer locks to avoid deadlock. + */ + if (new) { + *offset_ret = 0; + return 0; + } + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -1215,12 +1279,12 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) struct btrfs_device *device; u64 logical = cache->start; u64 length = cache->length; - u64 physical = 0; int ret; int i; unsigned int nofs_flag; u64 *alloc_offsets = NULL; u64 *caps = NULL; + u64 *physical = NULL; unsigned long *active = NULL; u64 last_alloc = 0; u32 num_sequential = 0, num_conventional = 0; @@ -1264,6 +1328,12 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } + physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS); + if (!physical) { + ret = -ENOMEM; + goto out; + } + active = bitmap_zalloc(map->num_stripes, GFP_NOFS); if (!active) { ret = -ENOMEM; @@ -1277,19 +1347,26 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) int dev_replace_is_ongoing = 0; device = map->stripes[i].dev; - physical = map->stripes[i].physical; + physical[i] = map->stripes[i].physical; if (device->bdev == NULL) { alloc_offsets[i] = WP_MISSING_DEV; continue; } - is_sequential = btrfs_dev_is_sequential(device, physical); + is_sequential = btrfs_dev_is_sequential(device, physical[i]); if (is_sequential) num_sequential++; else num_conventional++; + /* + * Consider a zone as active if we can allow any number of + * active zones. + */ + if (!device->zone_info->max_active_zones) + __set_bit(i, active); + if (!is_sequential) { alloc_offsets[i] = WP_CONVENTIONAL; continue; @@ -1299,21 +1376,21 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) * This zone will be used for allocation, so mark this zone * non-empty. */ - btrfs_dev_clear_zone_empty(device, physical); + btrfs_dev_clear_zone_empty(device, physical[i]); down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) - btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical); + btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]); up_read(&dev_replace->rwsem); /* * The group is mapped to a sequential zone. Get the zone write * pointer to determine the allocation offset within the zone. */ - WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size)); + WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size)); nofs_flag = memalloc_nofs_save(); - ret = btrfs_get_dev_zone(device, physical, &zone); + ret = btrfs_get_dev_zone(device, physical[i], &zone); memalloc_nofs_restore(nofs_flag); if (ret == -EIO || ret == -EOPNOTSUPP) { ret = 0; @@ -1339,7 +1416,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) case BLK_ZONE_COND_READONLY: btrfs_err(fs_info, "zoned: offline/readonly zone %llu on device %s (devid %llu)", - physical >> device->zone_info->zone_size_shift, + physical[i] >> device->zone_info->zone_size_shift, rcu_str_deref(device->name), device->devid); alloc_offsets[i] = WP_MISSING_DEV; break; @@ -1356,45 +1433,23 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) __set_bit(i, active); break; } - - /* - * Consider a zone as active if we can allow any number of - * active zones. - */ - if (!device->zone_info->max_active_zones) - __set_bit(i, active); } if (num_sequential > 0) cache->seq_zone = true; if (num_conventional > 0) { - /* - * Avoid calling calculate_alloc_pointer() for new BG. It - * is no use for new BG. It must be always 0. - * - * Also, we have a lock chain of extent buffer lock -> - * chunk mutex. For new BG, this function is called from - * btrfs_make_block_group() which is already taking the - * chunk mutex. Thus, we cannot call - * calculate_alloc_pointer() which takes extent buffer - * locks to avoid deadlock. - */ - /* Zone capacity is always zone size in emulation */ cache->zone_capacity = cache->length; - if (new) { - cache->alloc_offset = 0; - goto out; - } - ret = calculate_alloc_pointer(cache, &last_alloc); - if (ret || map->num_stripes == num_conventional) { - if (!ret) - cache->alloc_offset = last_alloc; - else - btrfs_err(fs_info, + ret = calculate_alloc_pointer(cache, &last_alloc, new); + if (ret) { + btrfs_err(fs_info, "zoned: failed to determine allocation offset of bg %llu", - cache->start); + cache->start); + goto out; + } else if (map->num_stripes == num_conventional) { + cache->alloc_offset = last_alloc; + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags); goto out; } } @@ -1404,15 +1459,54 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) if (alloc_offsets[0] == WP_MISSING_DEV) { btrfs_err(fs_info, "zoned: cannot recover write pointer for zone %llu", - physical); + physical[0]); ret = -EIO; goto out; } cache->alloc_offset = alloc_offsets[0]; cache->zone_capacity = caps[0]; - cache->zone_is_active = test_bit(0, active); + if (test_bit(0, active)) + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags); break; case BTRFS_BLOCK_GROUP_DUP: + if (map->type & BTRFS_BLOCK_GROUP_DATA) { + btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg"); + ret = -EINVAL; + goto out; + } + if (alloc_offsets[0] == WP_MISSING_DEV) { + btrfs_err(fs_info, + "zoned: cannot recover write pointer for zone %llu", + physical[0]); + ret = -EIO; + goto out; + } + if (alloc_offsets[1] == WP_MISSING_DEV) { + btrfs_err(fs_info, + "zoned: cannot recover write pointer for zone %llu", + physical[1]); + ret = -EIO; + goto out; + } + if (alloc_offsets[0] != alloc_offsets[1]) { + btrfs_err(fs_info, + "zoned: write pointer offset mismatch of zones in DUP profile"); + ret = -EIO; + goto out; + } + if (test_bit(0, active) != test_bit(1, active)) { + if (!btrfs_zone_activate(cache)) { + ret = -EIO; + goto out; + } + } else { + if (test_bit(0, active)) + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, + &cache->runtime_flags); + } + cache->alloc_offset = alloc_offsets[0]; + cache->zone_capacity = min(caps[0], caps[1]); + break; case BTRFS_BLOCK_GROUP_RAID1: case BTRFS_BLOCK_GROUP_RAID0: case BTRFS_BLOCK_GROUP_RAID10: @@ -1426,13 +1520,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } - if (cache->zone_is_active) { - btrfs_get_block_group(cache); - spin_lock(&fs_info->zone_active_bgs_lock); - list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs); - spin_unlock(&fs_info->zone_active_bgs_lock); - } - out: if (cache->alloc_offset > fs_info->zone_size) { btrfs_err(fs_info, @@ -1457,14 +1544,21 @@ out: ret = -EIO; } - if (!ret) + if (!ret) { cache->meta_write_pointer = cache->alloc_offset + cache->start; - - if (ret) { + if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) { + btrfs_get_block_group(cache); + spin_lock(&fs_info->zone_active_bgs_lock); + list_add_tail(&cache->active_bg_list, + &fs_info->zone_active_bgs); + spin_unlock(&fs_info->zone_active_bgs_lock); + } + } else { kfree(cache->physical_map); cache->physical_map = NULL; } bitmap_free(active); + kfree(physical); kfree(caps); kfree(alloc_offsets); free_extent_map(em); @@ -1485,7 +1579,6 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) free = cache->zone_capacity - cache->alloc_offset; /* We only need ->free_space in ALLOC_SEQ block groups */ - cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; cache->free_space_ctl->free_space = free; cache->zone_unusable = unusable; @@ -1685,12 +1778,14 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, &mapped_length, &bioc); if (ret || !bioc || mapped_length < PAGE_SIZE) { - btrfs_put_bioc(bioc); - return -EIO; + ret = -EIO; + goto out_put_bioc; } - if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) - return -EINVAL; + if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ret = -EINVAL; + goto out_put_bioc; + } nofs_flag = memalloc_nofs_save(); nmirrors = (int)bioc->num_stripes; @@ -1709,7 +1804,8 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, break; } memalloc_nofs_restore(nofs_flag); - +out_put_bioc: + btrfs_put_bioc(bioc); return ret; } @@ -1759,7 +1855,6 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, map = em->map_lookup; /* We only support single profile for now */ - ASSERT(map->num_stripes == 1); device = map->stripes[0].dev; free_extent_map(em); @@ -1777,52 +1872,56 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, bool btrfs_zone_activate(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_space_info *space_info = block_group->space_info; struct map_lookup *map; struct btrfs_device *device; u64 physical; bool ret; + int i; if (!btrfs_is_zoned(block_group->fs_info)) return true; map = block_group->physical_map; - /* Currently support SINGLE profile only */ - ASSERT(map->num_stripes == 1); - device = map->stripes[0].dev; - physical = map->stripes[0].physical; - - if (device->zone_info->max_active_zones == 0) - return true; + spin_lock(&space_info->lock); spin_lock(&block_group->lock); - - if (block_group->zone_is_active) { + if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) { ret = true; goto out_unlock; } /* No space left */ - if (block_group->alloc_offset == block_group->zone_capacity) { + if (btrfs_zoned_bg_is_full(block_group)) { ret = false; goto out_unlock; } - if (!btrfs_dev_set_active_zone(device, physical)) { - /* Cannot activate the zone */ - ret = false; - goto out_unlock; + for (i = 0; i < map->num_stripes; i++) { + device = map->stripes[i].dev; + physical = map->stripes[i].physical; + + if (device->zone_info->max_active_zones == 0) + continue; + + if (!btrfs_dev_set_active_zone(device, physical)) { + /* Cannot activate the zone */ + ret = false; + goto out_unlock; + } } /* Successfully activated all the zones */ - block_group->zone_is_active = 1; - + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); + space_info->active_total_bytes += block_group->length; spin_unlock(&block_group->lock); + btrfs_try_granting_tickets(fs_info, space_info); + spin_unlock(&space_info->lock); /* For the active block group list */ btrfs_get_block_group(block_group); spin_lock(&fs_info->zone_active_bgs_lock); - ASSERT(list_empty(&block_group->active_bg_list)); list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs); spin_unlock(&fs_info->zone_active_bgs_lock); @@ -1830,115 +1929,169 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) out_unlock: spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); return ret; } -int btrfs_zone_finish(struct btrfs_block_group *block_group) +static void wait_eb_writebacks(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct map_lookup *map; - struct btrfs_device *device; - u64 physical; - int ret = 0; - - if (!btrfs_is_zoned(fs_info)) - return 0; + const u64 end = block_group->start + block_group->length; + struct radix_tree_iter iter; + struct extent_buffer *eb; + void __rcu **slot; + + rcu_read_lock(); + radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, + block_group->start >> fs_info->sectorsize_bits) { + eb = radix_tree_deref_slot(slot); + if (!eb) + continue; + if (radix_tree_deref_retry(eb)) { + slot = radix_tree_iter_retry(&iter); + continue; + } - map = block_group->physical_map; - /* Currently support SINGLE profile only */ - ASSERT(map->num_stripes == 1); + if (eb->start < block_group->start) + continue; + if (eb->start >= end) + break; - device = map->stripes[0].dev; - physical = map->stripes[0].physical; + slot = radix_tree_iter_resume(slot, &iter); + rcu_read_unlock(); + wait_on_extent_buffer_writeback(eb); + rcu_read_lock(); + } + rcu_read_unlock(); +} - if (device->zone_info->max_active_zones == 0) - return 0; +static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct map_lookup *map; + const bool is_metadata = (block_group->flags & + (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)); + int ret = 0; + int i; spin_lock(&block_group->lock); - if (!block_group->zone_is_active) { + if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); return 0; } /* Check if we have unwritten allocated space */ - if ((block_group->flags & - (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) && - block_group->alloc_offset > block_group->meta_write_pointer) { + if (is_metadata && + block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) { spin_unlock(&block_group->lock); return -EAGAIN; } - spin_unlock(&block_group->lock); - - ret = btrfs_inc_block_group_ro(block_group, false); - if (ret) - return ret; - - /* Ensure all writes in this block group finish */ - btrfs_wait_block_group_reservations(block_group); - /* No need to wait for NOCOW writers. Zoned mode does not allow that. */ - btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start, - block_group->length); - - spin_lock(&block_group->lock); /* - * Bail out if someone already deactivated the block group, or - * allocated space is left in the block group. + * If we are sure that the block group is full (= no more room left for + * new allocation) and the IO for the last usable block is completed, we + * don't need to wait for the other IOs. This holds because we ensure + * the sequential IO submissions using the ZONE_APPEND command for data + * and block_group->meta_write_pointer for metadata. */ - if (!block_group->zone_is_active) { + if (!fully_written) { spin_unlock(&block_group->lock); - btrfs_dec_block_group_ro(block_group); - return 0; - } - if (block_group->reserved) { - spin_unlock(&block_group->lock); - btrfs_dec_block_group_ro(block_group); - return -EAGAIN; + ret = btrfs_inc_block_group_ro(block_group, false); + if (ret) + return ret; + + /* Ensure all writes in this block group finish */ + btrfs_wait_block_group_reservations(block_group); + /* No need to wait for NOCOW writers. Zoned mode does not allow that */ + btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start, + block_group->length); + /* Wait for extent buffers to be written. */ + if (is_metadata) + wait_eb_writebacks(block_group); + + spin_lock(&block_group->lock); + + /* + * Bail out if someone already deactivated the block group, or + * allocated space is left in the block group. + */ + if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, + &block_group->runtime_flags)) { + spin_unlock(&block_group->lock); + btrfs_dec_block_group_ro(block_group); + return 0; + } + + if (block_group->reserved) { + spin_unlock(&block_group->lock); + btrfs_dec_block_group_ro(block_group); + return -EAGAIN; + } } - block_group->zone_is_active = 0; + clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); block_group->alloc_offset = block_group->zone_capacity; block_group->free_space_ctl->free_space = 0; btrfs_clear_treelog_bg(block_group); btrfs_clear_data_reloc_bg(block_group); spin_unlock(&block_group->lock); - ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, - physical >> SECTOR_SHIFT, - device->zone_info->zone_size >> SECTOR_SHIFT, - GFP_NOFS); - btrfs_dec_block_group_ro(block_group); + map = block_group->physical_map; + for (i = 0; i < map->num_stripes; i++) { + struct btrfs_device *device = map->stripes[i].dev; + const u64 physical = map->stripes[i].physical; - if (!ret) { - btrfs_dev_clear_active_zone(device, physical); + if (device->zone_info->max_active_zones == 0) + continue; - spin_lock(&fs_info->zone_active_bgs_lock); - ASSERT(!list_empty(&block_group->active_bg_list)); - list_del_init(&block_group->active_bg_list); - spin_unlock(&fs_info->zone_active_bgs_lock); + ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, + physical >> SECTOR_SHIFT, + device->zone_info->zone_size >> SECTOR_SHIFT, + GFP_NOFS); - /* For active_bg_list */ - btrfs_put_block_group(block_group); + if (ret) + return ret; + + btrfs_dev_clear_active_zone(device, physical); } - return ret; + if (!fully_written) + btrfs_dec_block_group_ro(block_group); + + spin_lock(&fs_info->zone_active_bgs_lock); + ASSERT(!list_empty(&block_group->active_bg_list)); + list_del_init(&block_group->active_bg_list); + spin_unlock(&fs_info->zone_active_bgs_lock); + + /* For active_bg_list */ + btrfs_put_block_group(block_group); + + clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); + + return 0; +} + +int btrfs_zone_finish(struct btrfs_block_group *block_group) +{ + if (!btrfs_is_zoned(block_group->fs_info)) + return 0; + + return do_zone_finish(block_group, false); } bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) { + struct btrfs_fs_info *fs_info = fs_devices->fs_info; struct btrfs_device *device; bool ret = false; - if (!btrfs_is_zoned(fs_devices->fs_info)) + if (!btrfs_is_zoned(fs_info)) return true; - /* Non-single profiles are not supported yet */ - ASSERT((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0); - /* Check if there is a device with active zones left */ - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) { + mutex_lock(&fs_info->chunk_mutex); + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { struct btrfs_zoned_device_info *zinfo = device->zone_info; if (!device->bdev) @@ -1950,7 +2103,10 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) break; } } - mutex_unlock(&fs_devices->device_list_mutex); + mutex_unlock(&fs_info->chunk_mutex); + + if (!ret) + set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); return ret; } @@ -1958,9 +2114,7 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) { struct btrfs_block_group *block_group; - struct map_lookup *map; - struct btrfs_device *device; - u64 physical; + u64 min_alloc_bytes; if (!btrfs_is_zoned(fs_info)) return; @@ -1968,42 +2122,52 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len block_group = btrfs_lookup_block_group(fs_info, logical); ASSERT(block_group); - if (logical + length < block_group->start + block_group->zone_capacity) - goto out; - - spin_lock(&block_group->lock); + /* No MIXED_BG on zoned btrfs. */ + if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) + min_alloc_bytes = fs_info->sectorsize; + else + min_alloc_bytes = fs_info->nodesize; - if (!block_group->zone_is_active) { - spin_unlock(&block_group->lock); + /* Bail out if we can allocate more data from this block group. */ + if (logical + length + min_alloc_bytes <= + block_group->start + block_group->zone_capacity) goto out; - } - block_group->zone_is_active = 0; - /* We should have consumed all the free space */ - ASSERT(block_group->alloc_offset == block_group->zone_capacity); - ASSERT(block_group->free_space_ctl->free_space == 0); - btrfs_clear_treelog_bg(block_group); - btrfs_clear_data_reloc_bg(block_group); - spin_unlock(&block_group->lock); + do_zone_finish(block_group, true); - map = block_group->physical_map; - device = map->stripes[0].dev; - physical = map->stripes[0].physical; +out: + btrfs_put_block_group(block_group); +} - if (!device->zone_info->max_active_zones) - goto out; +static void btrfs_zone_finish_endio_workfn(struct work_struct *work) +{ + struct btrfs_block_group *bg = + container_of(work, struct btrfs_block_group, zone_finish_work); - btrfs_dev_clear_active_zone(device, physical); + wait_on_extent_buffer_writeback(bg->last_eb); + free_extent_buffer(bg->last_eb); + btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length); + btrfs_put_block_group(bg); +} - spin_lock(&fs_info->zone_active_bgs_lock); - ASSERT(!list_empty(&block_group->active_bg_list)); - list_del_init(&block_group->active_bg_list); - spin_unlock(&fs_info->zone_active_bgs_lock); +void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, + struct extent_buffer *eb) +{ + if (!bg->seq_zone || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity) + return; - btrfs_put_block_group(block_group); + if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) { + btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing", + bg->start); + return; + } -out: - btrfs_put_block_group(block_group); + /* For the work */ + btrfs_get_block_group(bg); + atomic_inc(&eb->refs); + bg->last_eb = eb; + INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn); + queue_work(system_unbound_wq, &bg->zone_finish_work); } void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) @@ -2033,3 +2197,153 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) } mutex_unlock(&fs_devices->device_list_mutex); } + +bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + u64 used = 0; + u64 total = 0; + u64 factor; + + ASSERT(btrfs_is_zoned(fs_info)); + + if (fs_info->bg_reclaim_threshold == 0) + return false; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (!device->bdev) + continue; + + total += device->disk_total_bytes; + used += device->bytes_used; + } + mutex_unlock(&fs_devices->device_list_mutex); + + factor = div64_u64(used * 100, total); + return factor >= fs_info->bg_reclaim_threshold; +} + +void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, + u64 length) +{ + struct btrfs_block_group *block_group; + + if (!btrfs_is_zoned(fs_info)) + return; + + block_group = btrfs_lookup_block_group(fs_info, logical); + /* It should be called on a previous data relocation block group. */ + ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)); + + spin_lock(&block_group->lock); + if (!test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) + goto out; + + /* All relocation extents are written. */ + if (block_group->start + block_group->alloc_offset == logical + length) { + /* Now, release this block group for further allocations. */ + clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, + &block_group->runtime_flags); + } + +out: + spin_unlock(&block_group->lock); + btrfs_put_block_group(block_group); +} + +int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_group *block_group; + struct btrfs_block_group *min_bg = NULL; + u64 min_avail = U64_MAX; + int ret; + + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(block_group, &fs_info->zone_active_bgs, + active_bg_list) { + u64 avail; + + spin_lock(&block_group->lock); + if (block_group->reserved || + (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) { + spin_unlock(&block_group->lock); + continue; + } + + avail = block_group->zone_capacity - block_group->alloc_offset; + if (min_avail > avail) { + if (min_bg) + btrfs_put_block_group(min_bg); + min_bg = block_group; + min_avail = avail; + btrfs_get_block_group(min_bg); + } + spin_unlock(&block_group->lock); + } + spin_unlock(&fs_info->zone_active_bgs_lock); + + if (!min_bg) + return 0; + + ret = btrfs_zone_finish(min_bg); + btrfs_put_block_group(min_bg); + + return ret < 0 ? ret : 1; +} + +int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + bool do_finish) +{ + struct btrfs_block_group *bg; + int index; + + if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA)) + return 0; + + /* No more block groups to activate */ + if (space_info->active_total_bytes == space_info->total_bytes) + return 0; + + for (;;) { + int ret; + bool need_finish = false; + + down_read(&space_info->groups_sem); + for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) { + list_for_each_entry(bg, &space_info->block_groups[index], + list) { + if (!spin_trylock(&bg->lock)) + continue; + if (btrfs_zoned_bg_is_full(bg) || + test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, + &bg->runtime_flags)) { + spin_unlock(&bg->lock); + continue; + } + spin_unlock(&bg->lock); + + if (btrfs_zone_activate(bg)) { + up_read(&space_info->groups_sem); + return 1; + } + + need_finish = true; + } + } + up_read(&space_info->groups_sem); + + if (!do_finish || !need_finish) + break; + + ret = btrfs_zone_finish_one_bg(fs_info); + if (ret == 0) + break; + if (ret < 0) + return ret; + } + + return 0; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index cbf016a7bb5d..8bd16d40b7c6 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -10,11 +10,7 @@ #include "block-group.h" #include "btrfs_inode.h" -/* - * Block groups with more than this value (percents) of unusable space will be - * scheduled for background reclaim. - */ -#define BTRFS_DEFAULT_RECLAIM_THRESH 75 +#define BTRFS_DEFAULT_RECLAIM_THRESH (75) struct btrfs_zoned_device_info { /* @@ -23,6 +19,7 @@ struct btrfs_zoned_device_info { */ u64 zone_size; u8 zone_size_shift; + u64 max_zone_append_size; u32 nr_zones; unsigned int max_active_zones; atomic_t active_zones_left; @@ -39,6 +36,7 @@ int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info); int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache); void btrfs_destroy_dev_zone_info(struct btrfs_device *device); +struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev); int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info); int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info); int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, @@ -76,8 +74,16 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group); bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length); +void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, + struct extent_buffer *eb); void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); +bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info); +void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, + u64 length); +int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info); +int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, bool do_finish); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -98,6 +104,16 @@ static inline int btrfs_get_dev_zone_info(struct btrfs_device *device, static inline void btrfs_destroy_dev_zone_info(struct btrfs_device *device) { } +/* + * In case the kernel is compiled without CONFIG_BLK_DEV_ZONED we'll never call + * into btrfs_clone_dev_zone_info() so it's safe to return NULL here. + */ +static inline struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info( + struct btrfs_device *orig_dev) +{ + return NULL; +} + static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info) { if (!btrfs_is_zoned(fs_info)) @@ -233,9 +249,34 @@ static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) { } +static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, + struct extent_buffer *eb) { } + static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { } static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { } + +static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) +{ + return false; +} + +static inline void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) { } + +static inline int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) +{ + return 1; +} + +static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + bool do_finish) +{ + /* Consider all the block groups are active */ + return 0; +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) @@ -359,7 +400,7 @@ static inline void btrfs_zoned_data_reloc_lock(struct btrfs_inode *inode) struct btrfs_root *root = inode->root; if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info)) - btrfs_inode_lock(&inode->vfs_inode, 0); + mutex_lock(&root->fs_info->zoned_data_reloc_io_lock); } static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode) @@ -367,7 +408,13 @@ static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode) struct btrfs_root *root = inode->root; if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info)) - btrfs_inode_unlock(&inode->vfs_inode, 0); + mutex_unlock(&root->fs_info->zoned_data_reloc_io_lock); +} + +static inline bool btrfs_zoned_bg_is_full(const struct btrfs_block_group *bg) +{ + ASSERT(btrfs_is_zoned(bg->fs_info)); + return (bg->alloc_offset == bg->zone_capacity); } #endif diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index fc42dd0badd7..35a0224d4eb7 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -93,22 +93,26 @@ static inline struct workspace *list_to_workspace(struct list_head *list) void zstd_free_workspace(struct list_head *ws); struct list_head *zstd_alloc_workspace(unsigned int level); -/* - * zstd_reclaim_timer_fn - reclaim timer + +/** + * Timer callback to free unused workspaces. + * * @t: timer * * This scans the lru_list and attempts to reclaim any workspace that hasn't * been used for ZSTD_BTRFS_RECLAIM_JIFFIES. + * + * The context is softirq and does not need the _bh locking primitives. */ static void zstd_reclaim_timer_fn(struct timer_list *timer) { unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES; struct list_head *pos, *next; - spin_lock_bh(&wsm.lock); + spin_lock(&wsm.lock); if (list_empty(&wsm.lru_list)) { - spin_unlock_bh(&wsm.lock); + spin_unlock(&wsm.lock); return; } @@ -137,7 +141,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) if (!list_empty(&wsm.lru_list)) mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); - spin_unlock_bh(&wsm.lock); + spin_unlock(&wsm.lock); } /* @@ -399,7 +403,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, /* map in the first page of input data */ in_page = find_get_page(mapping, start >> PAGE_SHIFT); - workspace->in_buf.src = kmap(in_page); + workspace->in_buf.src = kmap_local_page(in_page); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); @@ -411,7 +415,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.dst = page_address(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -446,9 +450,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, if (workspace->out_buf.pos == workspace->out_buf.size) { tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; - kunmap(out_page); if (nr_pages == nr_dest_pages) { - out_page = NULL; ret = -E2BIG; goto out; } @@ -458,7 +460,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.dst = page_address(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -473,13 +475,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, /* Check if we need more input */ if (workspace->in_buf.pos == workspace->in_buf.size) { tot_in += PAGE_SIZE; - kunmap(in_page); + kunmap_local(workspace->in_buf.src); put_page(in_page); - start += PAGE_SIZE; len -= PAGE_SIZE; in_page = find_get_page(mapping, start >> PAGE_SHIFT); - workspace->in_buf.src = kmap(in_page); + workspace->in_buf.src = kmap_local_page(in_page); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); } @@ -506,9 +507,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; - kunmap(out_page); if (nr_pages == nr_dest_pages) { - out_page = NULL; ret = -E2BIG; goto out; } @@ -518,7 +517,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.dst = page_address(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); } @@ -533,13 +532,10 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, *total_out = tot_out; out: *out_pages = nr_pages; - /* Cleanup */ - if (in_page) { - kunmap(in_page); + if (workspace->in_buf.src) { + kunmap_local(workspace->in_buf.src); put_page(in_page); } - if (out_page) - kunmap(out_page); return ret; } @@ -563,7 +559,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) goto done; } - workspace->in_buf.src = kmap(pages_in[page_in_index]); + workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); @@ -599,14 +595,15 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) break; if (workspace->in_buf.pos == workspace->in_buf.size) { - kunmap(pages_in[page_in_index++]); + kunmap_local(workspace->in_buf.src); + page_in_index++; if (page_in_index >= total_pages_in) { workspace->in_buf.src = NULL; ret = -EIO; goto done; } srclen -= PAGE_SIZE; - workspace->in_buf.src = kmap(pages_in[page_in_index]); + workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); } @@ -615,7 +612,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) zero_fill_bio(cb->orig_bio); done: if (workspace->in_buf.src) - kunmap(pages_in[page_in_index]); + kunmap_local(workspace->in_buf.src); return ret; } |